@agentv/core 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -32,17 +32,21 @@ var index_exports = {};
32
32
  __export(index_exports, {
33
33
  CodeEvaluator: () => CodeEvaluator,
34
34
  CompositeEvaluator: () => CompositeEvaluator,
35
+ DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
35
36
  LlmJudgeEvaluator: () => LlmJudgeEvaluator,
36
37
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
37
38
  ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
39
+ avgToolDurationMs: () => avgToolDurationMs,
38
40
  buildDirectoryChain: () => buildDirectoryChain2,
39
41
  buildPromptInputs: () => buildPromptInputs,
40
42
  buildSearchRoots: () => buildSearchRoots2,
41
43
  computeTraceSummary: () => computeTraceSummary,
42
44
  consumeCodexLogEntries: () => consumeCodexLogEntries,
45
+ consumePiLogEntries: () => consumePiLogEntries,
43
46
  createAgentKernel: () => createAgentKernel,
44
47
  createProvider: () => createProvider,
45
48
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
49
+ explorationRatio: () => explorationRatio,
46
50
  extractCodeBlocks: () => extractCodeBlocks,
47
51
  fileExists: () => fileExists2,
48
52
  findGitRoot: () => findGitRoot,
@@ -56,6 +60,7 @@ __export(index_exports, {
56
60
  isTestMessageRole: () => isTestMessageRole,
57
61
  listTargetNames: () => listTargetNames,
58
62
  loadEvalCases: () => loadEvalCases,
63
+ mergeExecutionMetrics: () => mergeExecutionMetrics,
59
64
  normalizeLineEndings: () => normalizeLineEndings,
60
65
  readJsonFile: () => readJsonFile,
61
66
  readTargetDefinitions: () => readTargetDefinitions,
@@ -66,7 +71,9 @@ __export(index_exports, {
66
71
  resolveTargetDefinition: () => resolveTargetDefinition,
67
72
  runEvalCase: () => runEvalCase,
68
73
  runEvaluation: () => runEvaluation,
69
- subscribeToCodexLogEntries: () => subscribeToCodexLogEntries
74
+ subscribeToCodexLogEntries: () => subscribeToCodexLogEntries,
75
+ subscribeToPiLogEntries: () => subscribeToPiLogEntries,
76
+ tokensPerTool: () => tokensPerTool
70
77
  });
71
78
  module.exports = __toCommonJS(index_exports);
72
79
 
@@ -151,6 +158,53 @@ function computeTraceSummary(messages) {
151
158
  errorCount: 0
152
159
  };
153
160
  }
161
/**
 * Tool names that `explorationRatio` counts as exploration when the caller
 * does not supply its own list. Both lower-case and capitalized spellings are
 * listed because the lookup is an exact, case-sensitive key match.
 */
var DEFAULT_EXPLORATION_TOOLS = [
  "read",
  "grep",
  "glob",
  "search",
  "list",
  "Read",
  "Grep",
  "Glob",
  "WebSearch",
  "WebFetch"
];
/**
 * Share of recorded events that were calls to exploration tools.
 *
 * @param {{ eventCount: number, toolCallsByName: Record<string, number> }} summary
 *   Trace summary; only `eventCount` and `toolCallsByName` are read.
 * @param {string[]} [explorationTools] Tool names to count (defaults to
 *   DEFAULT_EXPLORATION_TOOLS).
 * @returns {number | undefined} explorationCalls / eventCount, or undefined
 *   when `eventCount` is 0.
 */
function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
  if (summary.eventCount === 0) return void 0;
  const callsByName = summary.toolCallsByName;
  let explorationCalls = 0;
  // A Set makes a name listed twice count once, so the ratio cannot be
  // inflated by a sloppy tool list.
  for (const tool of new Set(explorationTools)) {
    // Own-property check: names such as "constructor" or "toString" would
    // otherwise resolve to inherited functions and poison the sum.
    if (Object.prototype.hasOwnProperty.call(callsByName, tool)) {
      explorationCalls += callsByName[tool] ?? 0;
    }
  }
  return explorationCalls / summary.eventCount;
}
181
/**
 * Average number of tokens (input + output) per recorded event.
 *
 * @param {{ eventCount: number, tokenUsage?: { input: number, output: number } }} summary
 * @returns {number | undefined} undefined when token usage is missing or
 *   `eventCount` is 0.
 */
function tokensPerTool(summary) {
  const usage = summary.tokenUsage;
  if (!usage) {
    return void 0;
  }
  const events = summary.eventCount;
  if (events === 0) {
    return void 0;
  }
  return (usage.input + usage.output) / events;
}
186
/**
 * Mean duration over every recorded tool call, across all tools.
 *
 * @param {{ toolDurations?: Record<string, Iterable<number>> }} summary
 * @returns {number | undefined} undefined when `toolDurations` is missing or
 *   holds no samples.
 */
function avgToolDurationMs(summary) {
  const perTool = summary.toolDurations;
  if (!perTool) {
    return void 0;
  }
  let sum = 0;
  let samples = 0;
  Object.values(perTool).forEach((durations) => {
    for (const ms of durations) {
      sum += ms;
      samples += 1;
    }
  });
  return samples === 0 ? void 0 : sum / samples;
}
199
/**
 * Return a copy of `summary` with provider-reported execution metrics applied.
 *
 * Only metrics that are actually present are copied. A metric left undefined
 * by the provider no longer erases a value the summary already carries and no
 * longer adds an undefined-valued key to the result.
 *
 * @param {object} summary Trace summary to extend; never mutated.
 * @param {{ tokenUsage?: object, costUsd?: number, durationMs?: number } | null | undefined} metrics
 * @returns {object} `summary` itself when `metrics` is falsy, otherwise a new
 *   object.
 */
function mergeExecutionMetrics(summary, metrics) {
  if (!metrics) return summary;
  const merged = { ...summary };
  for (const key of ["tokenUsage", "costUsd", "durationMs"]) {
    if (metrics[key] !== void 0) {
      merged[key] = metrics[key];
    }
  }
  return merged;
}
154
208
 
155
209
  // src/evaluation/yaml-parser.ts
156
210
  var import_promises6 = require("fs/promises");
@@ -665,7 +719,13 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
665
719
  expected = [];
666
720
  for (const item of rawExpected) {
667
721
  if (isJsonObject2(item) && typeof item.tool === "string") {
668
- expected.push({ tool: item.tool });
722
+ let args;
723
+ if (item.args === "any") {
724
+ args = "any";
725
+ } else if (isJsonObject2(item.args)) {
726
+ args = item.args;
727
+ }
728
+ expected.push({ tool: item.tool, ...args !== void 0 ? { args } : {} });
669
729
  }
670
730
  }
671
731
  }
@@ -1940,12 +2000,14 @@ var CliProvider = class {
1940
2000
  `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
1941
2001
  );
1942
2002
  }
2003
+ const startTime = Date.now();
1943
2004
  const result = await this.runCommand(renderedCommand, {
1944
2005
  cwd: this.config.cwd,
1945
2006
  env: process.env,
1946
2007
  timeoutMs: this.config.timeoutMs,
1947
2008
  signal: request.signal
1948
2009
  });
2010
+ const measuredDurationMs = Date.now() - startTime;
1949
2011
  if (result.failed || (result.exitCode ?? 0) !== 0) {
1950
2012
  if (request.signal?.aborted) {
1951
2013
  throw new Error("CLI provider request was aborted");
@@ -1964,6 +2026,9 @@ var CliProvider = class {
1964
2026
  const parsed = this.parseOutputContent(responseContent);
1965
2027
  return {
1966
2028
  outputMessages: parsed.outputMessages,
2029
+ tokenUsage: parsed.tokenUsage,
2030
+ costUsd: parsed.costUsd,
2031
+ durationMs: parsed.durationMs ?? measuredDurationMs,
1967
2032
  raw: {
1968
2033
  command: renderedCommand,
1969
2034
  stderr: result.stderr,
@@ -2011,12 +2076,14 @@ var CliProvider = class {
2011
2076
  `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
2012
2077
  );
2013
2078
  }
2079
+ const startTime = Date.now();
2014
2080
  const result = await this.runCommand(renderedCommand, {
2015
2081
  cwd: this.config.cwd,
2016
2082
  env: process.env,
2017
2083
  timeoutMs: this.config.timeoutMs,
2018
2084
  signal: controller.signal
2019
2085
  });
2086
+ const measuredDurationMs = Date.now() - startTime;
2020
2087
  if (result.failed || (result.exitCode ?? 0) !== 0) {
2021
2088
  if (controller.signal.aborted) {
2022
2089
  throw new Error("CLI provider request was aborted");
@@ -2038,11 +2105,13 @@ var CliProvider = class {
2038
2105
  if (missingIds.length > 0) {
2039
2106
  throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
2040
2107
  }
2108
+ const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
2041
2109
  const responses = requests.map((request) => {
2042
2110
  const evalCaseId = request.evalCaseId;
2043
2111
  if (!evalCaseId) {
2044
2112
  return {
2045
2113
  outputMessages: [],
2114
+ durationMs: perRequestFallbackMs,
2046
2115
  raw: {
2047
2116
  command: renderedCommand,
2048
2117
  stderr: result.stderr,
@@ -2056,6 +2125,7 @@ var CliProvider = class {
2056
2125
  if (!parsed) {
2057
2126
  return {
2058
2127
  outputMessages: [],
2128
+ durationMs: perRequestFallbackMs,
2059
2129
  raw: {
2060
2130
  command: renderedCommand,
2061
2131
  stderr: result.stderr,
@@ -2067,6 +2137,9 @@ var CliProvider = class {
2067
2137
  }
2068
2138
  return {
2069
2139
  outputMessages: parsed.outputMessages,
2140
+ tokenUsage: parsed.tokenUsage,
2141
+ costUsd: parsed.costUsd,
2142
+ durationMs: parsed.durationMs ?? perRequestFallbackMs,
2070
2143
  raw: {
2071
2144
  command: renderedCommand,
2072
2145
  stderr: result.stderr,
@@ -2084,25 +2157,55 @@ var CliProvider = class {
2084
2157
  * If the content is valid JSON with 'output_messages' or 'text' field, extract them.
2085
2158
  * If only 'text' is provided, wrap it in outputMessages.
2086
2159
  * Otherwise, treat the entire content as plain text wrapped in outputMessages.
2160
+ *
2161
+ * Also extracts optional execution metrics:
2162
+ * - token_usage: { input, output, cached? }
2163
+ * - cost_usd: number
2164
+ * - duration_ms: number
2087
2165
  */
2088
2166
  parseOutputContent(content) {
2089
2167
  try {
2090
2168
  const parsed = JSON.parse(content);
2091
2169
  if (typeof parsed === "object" && parsed !== null) {
2092
2170
  const obj = parsed;
2171
+ const tokenUsage = this.parseTokenUsage(obj.token_usage);
2172
+ const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
2173
+ const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
2093
2174
  const outputMessages = this.parseOutputMessages(obj.output_messages);
2094
2175
  if (outputMessages && outputMessages.length > 0) {
2095
- return { outputMessages };
2176
+ return { outputMessages, tokenUsage, costUsd, durationMs };
2096
2177
  }
2097
2178
  if ("text" in obj) {
2098
2179
  const text = typeof obj.text === "string" ? obj.text : String(obj.text);
2099
- return { outputMessages: [{ role: "assistant", content: text }] };
2180
+ return {
2181
+ outputMessages: [{ role: "assistant", content: text }],
2182
+ tokenUsage,
2183
+ costUsd,
2184
+ durationMs
2185
+ };
2100
2186
  }
2101
2187
  }
2102
2188
  } catch {
2103
2189
  }
2104
2190
  return { outputMessages: [{ role: "assistant", content }] };
2105
2191
  }
2192
+ /**
2193
+ * Parse token_usage from CLI output.
2194
+ */
2195
+ parseTokenUsage(tokenUsage) {
2196
+ if (typeof tokenUsage !== "object" || tokenUsage === null) {
2197
+ return void 0;
2198
+ }
2199
+ const obj = tokenUsage;
2200
+ if (typeof obj.input !== "number" || typeof obj.output !== "number") {
2201
+ return void 0;
2202
+ }
2203
+ return {
2204
+ input: obj.input,
2205
+ output: obj.output,
2206
+ cached: typeof obj.cached === "number" ? obj.cached : void 0
2207
+ };
2208
+ }
2106
2209
  /**
2107
2210
  * Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
2108
2211
  */
@@ -2179,6 +2282,9 @@ var CliProvider = class {
2179
2282
  if (records.has(id)) {
2180
2283
  throw new Error(`CLI batch output contains duplicate id: ${id}`);
2181
2284
  }
2285
+ const tokenUsage = this.parseTokenUsage(obj.token_usage);
2286
+ const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
2287
+ const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
2182
2288
  const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
2183
2289
  let outputMessages;
2184
2290
  if (parsedOutputMessages && parsedOutputMessages.length > 0) {
@@ -2188,7 +2294,10 @@ var CliProvider = class {
2188
2294
  outputMessages = text ? [{ role: "assistant", content: text }] : [];
2189
2295
  }
2190
2296
  records.set(id, {
2191
- outputMessages
2297
+ outputMessages,
2298
+ tokenUsage,
2299
+ costUsd,
2300
+ durationMs
2192
2301
  });
2193
2302
  }
2194
2303
  return records;
@@ -2504,6 +2613,11 @@ var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exe
2504
2613
  var WORKSPACE_PREFIX = "agentv-codex-";
2505
2614
  var PROMPT_FILENAME = "prompt.md";
2506
2615
  var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
2616
+ var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
2617
+ - Do NOT create any additional output files in the workspace.
2618
+ - All intended file outputs/changes MUST be written in your response.
2619
+ - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
2620
+ This is required for evaluation scoring.`;
2507
2621
  var CodexProvider = class {
2508
2622
  id;
2509
2623
  kind = "codex";
@@ -2528,7 +2642,11 @@ var CodexProvider = class {
2528
2642
  const workspaceRoot = await this.createWorkspace();
2529
2643
  const logger = await this.createStreamLogger(request).catch(() => void 0);
2530
2644
  try {
2531
- const promptContent = buildPromptDocument(request, inputFiles);
2645
+ const basePrompt = buildPromptDocument(request, inputFiles);
2646
+ const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
2647
+ const promptContent = `${systemPrompt}
2648
+
2649
+ ${basePrompt}`;
2532
2650
  const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
2533
2651
  await (0, import_promises9.writeFile)(promptFile, promptContent, "utf8");
2534
2652
  const args = this.buildCodexArgs();
@@ -3212,222 +3330,1067 @@ var MockProvider = class {
3212
3330
  }
3213
3331
  };
3214
3332
 
3215
- // src/evaluation/providers/targets.ts
3333
+ // src/evaluation/providers/pi-coding-agent.ts
3334
+ var import_node_child_process3 = require("child_process");
3335
+ var import_node_crypto2 = require("crypto");
3336
+ var import_node_fs4 = require("fs");
3337
+ var import_promises10 = require("fs/promises");
3338
+ var import_node_os3 = require("os");
3216
3339
  var import_node_path11 = __toESM(require("path"), 1);
3217
- var import_zod = require("zod");
3218
- var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
3219
- "PROMPT",
3220
- "GUIDELINES",
3221
- "EVAL_ID",
3222
- "ATTEMPT",
3223
- "FILES",
3224
- "OUTPUT_FILE"
3225
- ]);
3226
- var BASE_TARGET_SCHEMA = import_zod.z.object({
3227
- name: import_zod.z.string().min(1, "target name is required"),
3228
- provider: import_zod.z.string().min(1, "provider is required"),
3229
- judge_target: import_zod.z.string().optional(),
3230
- workers: import_zod.z.number().int().min(1).optional()
3231
- }).passthrough();
3232
- var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
3233
- function normalizeAzureApiVersion(value) {
3234
- if (!value) {
3235
- return DEFAULT_AZURE_API_VERSION;
3236
- }
3237
- const trimmed = value.trim();
3238
- if (trimmed.length === 0) {
3239
- return DEFAULT_AZURE_API_VERSION;
3340
+
3341
+ // src/evaluation/providers/pi-log-tracker.ts
3342
+ var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.piLogs");
3343
+ var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.piLogSubscribers");
3344
+ function getPiLogStore() {
3345
+ const globalObject = globalThis;
3346
+ const existing = globalObject[GLOBAL_LOGS_KEY2];
3347
+ if (existing) {
3348
+ return existing;
3240
3349
  }
3241
- const withoutPrefix = trimmed.replace(/^api[-_]?version\s*=\s*/i, "").trim();
3242
- return withoutPrefix.length > 0 ? withoutPrefix : DEFAULT_AZURE_API_VERSION;
3350
+ const created = [];
3351
+ globalObject[GLOBAL_LOGS_KEY2] = created;
3352
+ return created;
3243
3353
  }
3244
- function resolveRetryConfig(target) {
3245
- const maxRetries = resolveOptionalNumber(
3246
- target.max_retries ?? target.maxRetries,
3247
- `${target.name} max retries`
3248
- );
3249
- const initialDelayMs = resolveOptionalNumber(
3250
- target.retry_initial_delay_ms ?? target.retryInitialDelayMs,
3251
- `${target.name} retry initial delay`
3252
- );
3253
- const maxDelayMs = resolveOptionalNumber(
3254
- target.retry_max_delay_ms ?? target.retryMaxDelayMs,
3255
- `${target.name} retry max delay`
3256
- );
3257
- const backoffFactor = resolveOptionalNumber(
3258
- target.retry_backoff_factor ?? target.retryBackoffFactor,
3259
- `${target.name} retry backoff factor`
3260
- );
3261
- const retryableStatusCodes = resolveOptionalNumberArray(
3262
- target.retry_status_codes ?? target.retryStatusCodes,
3263
- `${target.name} retry status codes`
3264
- );
3265
- if (maxRetries === void 0 && initialDelayMs === void 0 && maxDelayMs === void 0 && backoffFactor === void 0 && retryableStatusCodes === void 0) {
3266
- return void 0;
3354
+ function getSubscriberStore2() {
3355
+ const globalObject = globalThis;
3356
+ const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
3357
+ if (existing) {
3358
+ return existing;
3267
3359
  }
3268
- return {
3269
- maxRetries,
3270
- initialDelayMs,
3271
- maxDelayMs,
3272
- backoffFactor,
3273
- retryableStatusCodes
3274
- };
3360
+ const created = /* @__PURE__ */ new Set();
3361
+ globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
3362
+ return created;
3275
3363
  }
3276
- function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
3277
- const parsed = BASE_TARGET_SCHEMA.parse(definition);
3278
- const provider = parsed.provider.toLowerCase();
3279
- const providerBatching = resolveOptionalBoolean(
3280
- parsed.provider_batching ?? parsed.providerBatching
3281
- );
3282
- switch (provider) {
3283
- case "azure":
3284
- case "azure-openai":
3285
- return {
3286
- kind: "azure",
3287
- name: parsed.name,
3288
- judgeTarget: parsed.judge_target,
3289
- workers: parsed.workers,
3290
- providerBatching,
3291
- config: resolveAzureConfig(parsed, env)
3292
- };
3293
- case "anthropic":
3294
- return {
3295
- kind: "anthropic",
3296
- name: parsed.name,
3297
- judgeTarget: parsed.judge_target,
3298
- workers: parsed.workers,
3299
- providerBatching,
3300
- config: resolveAnthropicConfig(parsed, env)
3301
- };
3302
- case "gemini":
3303
- case "google":
3304
- case "google-gemini":
3305
- return {
3306
- kind: "gemini",
3307
- name: parsed.name,
3308
- judgeTarget: parsed.judge_target,
3309
- workers: parsed.workers,
3310
- providerBatching,
3311
- config: resolveGeminiConfig(parsed, env)
3312
- };
3313
- case "codex":
3314
- case "codex-cli":
3315
- return {
3316
- kind: "codex",
3317
- name: parsed.name,
3318
- judgeTarget: parsed.judge_target,
3319
- workers: parsed.workers,
3320
- providerBatching,
3321
- config: resolveCodexConfig(parsed, env)
3322
- };
3323
- case "mock":
3324
- return {
3325
- kind: "mock",
3326
- name: parsed.name,
3327
- judgeTarget: parsed.judge_target,
3328
- workers: parsed.workers,
3329
- providerBatching,
3330
- config: resolveMockConfig(parsed)
3331
- };
3332
- case "vscode":
3333
- case "vscode-insiders":
3334
- return {
3335
- kind: provider,
3336
- name: parsed.name,
3337
- judgeTarget: parsed.judge_target,
3338
- workers: parsed.workers,
3339
- providerBatching,
3340
- config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
3341
- };
3342
- case "cli":
3343
- return {
3344
- kind: "cli",
3345
- name: parsed.name,
3346
- judgeTarget: parsed.judge_target,
3347
- workers: parsed.workers,
3348
- providerBatching,
3349
- config: resolveCliConfig(parsed, env, evalFilePath)
3350
- };
3351
- default:
3352
- throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
3364
+ function notifySubscribers2(entry) {
3365
+ const subscribers = Array.from(getSubscriberStore2());
3366
+ for (const listener of subscribers) {
3367
+ try {
3368
+ listener(entry);
3369
+ } catch (error) {
3370
+ const message = error instanceof Error ? error.message : String(error);
3371
+ console.warn(`Pi log subscriber failed: ${message}`);
3372
+ }
3353
3373
  }
3354
3374
  }
3355
- function resolveAzureConfig(target, env) {
3356
- const endpointSource = target.endpoint ?? target.resource ?? target.resourceName;
3357
- const apiKeySource = target.api_key ?? target.apiKey;
3358
- const deploymentSource = target.deployment ?? target.deploymentName ?? target.model;
3359
- const versionSource = target.version ?? target.api_version;
3360
- const temperatureSource = target.temperature;
3361
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
3362
- const resourceName = resolveString(endpointSource, env, `${target.name} endpoint`);
3363
- const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
3364
- const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
3365
- const version = normalizeAzureApiVersion(
3366
- resolveOptionalString(versionSource, env, `${target.name} api version`, {
3367
- allowLiteral: true,
3368
- optionalEnv: true
3369
- })
3370
- );
3371
- const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
3372
- const maxOutputTokens = resolveOptionalNumber(
3373
- maxTokensSource,
3374
- `${target.name} max output tokens`
3375
- );
3376
- const retry = resolveRetryConfig(target);
3377
- return {
3378
- resourceName,
3379
- deploymentName,
3380
- apiKey,
3381
- version,
3382
- temperature,
3383
- maxOutputTokens,
3384
- retry
3385
- };
3375
+ function recordPiLogEntry(entry) {
3376
+ getPiLogStore().push(entry);
3377
+ notifySubscribers2(entry);
3386
3378
  }
3387
- function resolveAnthropicConfig(target, env) {
3388
- const apiKeySource = target.api_key ?? target.apiKey;
3389
- const modelSource = target.model ?? target.deployment ?? target.variant;
3390
- const temperatureSource = target.temperature;
3391
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
3392
- const thinkingBudgetSource = target.thinking_budget ?? target.thinkingBudget;
3393
- const apiKey = resolveString(apiKeySource, env, `${target.name} Anthropic api key`);
3394
- const model = resolveString(modelSource, env, `${target.name} Anthropic model`);
3395
- const retry = resolveRetryConfig(target);
3396
- return {
3397
- apiKey,
3398
- model,
3399
- temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
3400
- maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
3401
- thinkingBudget: resolveOptionalNumber(thinkingBudgetSource, `${target.name} thinking budget`),
3402
- retry
3403
- };
3379
+ function consumePiLogEntries() {
3380
+ const store = getPiLogStore();
3381
+ if (store.length === 0) {
3382
+ return [];
3383
+ }
3384
+ return store.splice(0, store.length);
3404
3385
  }
3405
- function resolveGeminiConfig(target, env) {
3406
- const apiKeySource = target.api_key ?? target.apiKey;
3407
- const modelSource = target.model ?? target.deployment ?? target.variant;
3408
- const temperatureSource = target.temperature;
3409
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
3410
- const apiKey = resolveString(apiKeySource, env, `${target.name} Google API key`);
3411
- const model = resolveOptionalString(modelSource, env, `${target.name} Gemini model`, {
3412
- allowLiteral: true,
3413
- optionalEnv: true
3414
- }) ?? "gemini-2.5-flash";
3415
- const retry = resolveRetryConfig(target);
3416
- return {
3417
- apiKey,
3418
- model,
3419
- temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
3420
- maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
3421
- retry
3386
+ function subscribeToPiLogEntries(listener) {
3387
+ const store = getSubscriberStore2();
3388
+ store.add(listener);
3389
+ return () => {
3390
+ store.delete(listener);
3422
3391
  };
3423
3392
  }
3424
- function resolveCodexConfig(target, env) {
3425
- const executableSource = target.executable ?? target.command ?? target.binary;
3426
- const argsSource = target.args ?? target.arguments;
3427
- const cwdSource = target.cwd;
3428
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
3393
+
3394
+ // src/evaluation/providers/pi-coding-agent.ts
3395
+ var WORKSPACE_PREFIX2 = "agentv-pi-";
3396
+ var PROMPT_FILENAME2 = "prompt.md";
3397
+ var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
3398
+ - Do NOT create any additional output files in the workspace.
3399
+ - All intended file outputs/changes MUST be written in your response.
3400
+ - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
3401
+ This is required for evaluation scoring.`;
3402
/**
 * Provider that answers an eval request by running the Pi coding agent CLI in
 * a temporary workspace and parsing its JSON-lines stdout.
 */
var PiCodingAgentProvider = class {
  id;
  kind = "pi-coding-agent";
  targetName;
  supportsBatch = false;
  config;
  runPi;
  /**
   * @param {string} targetName Target name, used in the provider id and logs.
   * @param {object} config Resolved settings (executable, provider, model,
   *   apiKey, tools, thinking, args, cwd, timeoutMs, logDir, logFormat,
   *   systemPrompt).
   * @param {Function} [runner] Process runner; injectable for tests.
   */
  constructor(targetName, config, runner = defaultPiRunner) {
    this.id = `pi-coding-agent:${targetName}`;
    this.targetName = targetName;
    this.config = config;
    this.runPi = runner;
  }
  /**
   * Run the agent for one request.
   *
   * @param {object} request Reads `question`, `inputFiles`, `signal`,
   *   `evalCaseId` and `attempt`.
   * @returns {Promise<{ raw: object, outputMessages: unknown }>}
   * @throws {Error} When the request is already aborted, the agent times out,
   *   or it exits with a non-zero code.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Pi coding agent request was aborted before execution");
    }
    const inputFiles = normalizeInputFiles2(request.inputFiles);
    const workspaceRoot = await this.createWorkspace();
    // Stream logging is best-effort: a logger that cannot be created is skipped.
    const logger = await this.createStreamLogger(request).catch(() => void 0);
    try {
      const promptFile = import_node_path11.default.join(workspaceRoot, PROMPT_FILENAME2);
      await (0, import_promises10.writeFile)(promptFile, request.question, "utf8");
      const args = this.buildPiArgs(request.question, inputFiles);
      const cwd = this.resolveCwd(workspaceRoot);
      const result = await this.executePi(args, cwd, request.signal, logger);
      if (result.timedOut) {
        throw new Error(
          `Pi coding agent timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
        );
      }
      if (result.exitCode !== 0) {
        const detail = pickDetail2(result.stderr, result.stdout);
        const prefix = `Pi coding agent exited with code ${result.exitCode}`;
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }
      const parsed = parsePiJsonl(result.stdout);
      const outputMessages = extractOutputMessages(parsed);
      // The return value is unused here. NOTE(review): the call is kept in
      // case the helper validates the messages - drop it if it is pure.
      extractAssistantText2(outputMessages);
      return {
        raw: {
          response: parsed,
          stdout: result.stdout,
          stderr: result.stderr,
          exitCode: result.exitCode,
          // `raw` is handed back to callers, so the API key must not travel in it.
          args: this.redactApiKey(args),
          executable: this.config.executable,
          promptFile,
          workspace: workspaceRoot,
          inputFiles,
          logFile: logger?.filePath
        },
        outputMessages
      };
    } finally {
      try {
        await logger?.close();
      } catch (error) {
        // A failing log close must not skip workspace cleanup or replace the
        // real outcome of the call.
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`Failed to close Pi stream log: ${message}`);
      } finally {
        await this.cleanupWorkspace(workspaceRoot);
      }
    }
  }
  /**
   * Copy `args` with the value of any `--api-key` option masked.
   * Handles both `--api-key VALUE` and `--api-key=VALUE`.
   */
  redactApiKey(args) {
    return args.map((arg, index) => {
      if (index > 0 && args[index - 1] === "--api-key") {
        return "[REDACTED]";
      }
      if (typeof arg === "string" && arg.startsWith("--api-key=")) {
        return "--api-key=[REDACTED]";
      }
      return arg;
    });
  }
  /** Working directory for the agent: the configured cwd, else the temp workspace. */
  resolveCwd(workspaceRoot) {
    if (!this.config.cwd) {
      return workspaceRoot;
    }
    return import_node_path11.default.resolve(this.config.cwd);
  }
  /**
   * Build the CLI argument list: options, `@file` references, then the prompt
   * (system prompt + blank line + question) as the last argument.
   */
  buildPiArgs(prompt, inputFiles) {
    const args = [];
    if (this.config.provider) {
      args.push("--provider", this.config.provider);
    }
    if (this.config.model) {
      args.push("--model", this.config.model);
    }
    if (this.config.apiKey) {
      // NOTE(review): the key is still visible on the child's command line;
      // buildEnv() also exports it for the providers it knows. Confirm whether
      // the flag can be dropped for those providers.
      args.push("--api-key", this.config.apiKey);
    }
    args.push("--mode", "json");
    args.push("--print");
    args.push("--no-session");
    if (this.config.tools) {
      args.push("--tools", this.config.tools);
    }
    if (this.config.thinking) {
      args.push("--thinking", this.config.thinking);
    }
    if (this.config.args && this.config.args.length > 0) {
      args.push(...this.config.args);
    }
    if (inputFiles && inputFiles.length > 0) {
      for (const file of inputFiles) {
        args.push(`@${file}`);
      }
    }
    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
    const fullPrompt = `${systemPrompt}\n\n${prompt}`;
    args.push(escapeAtSymbols(fullPrompt));
    return args;
  }
  /**
   * Invoke the runner, turning a missing executable (ENOENT) into an
   * actionable error. Other failures are rethrown unchanged.
   */
  async executePi(args, cwd, signal, logger) {
    try {
      return await this.runPi({
        executable: this.config.executable,
        args,
        cwd,
        timeoutMs: this.config.timeoutMs,
        env: this.buildEnv(),
        signal,
        onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
        onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
      });
    } catch (error) {
      const err = error;
      if (err.code === "ENOENT") {
        throw new Error(
          `Pi coding agent executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
        );
      }
      throw error;
    }
  }
  /**
   * Copy of process.env plus, when an API key is configured, the key variable
   * for the configured provider ("google" when none is set). Providers not
   * listed below get no variable.
   */
  buildEnv() {
    const env = { ...process.env };
    if (this.config.apiKey) {
      const provider = this.config.provider?.toLowerCase() ?? "google";
      switch (provider) {
        case "google":
        case "gemini":
          env.GEMINI_API_KEY = this.config.apiKey;
          break;
        case "anthropic":
          env.ANTHROPIC_API_KEY = this.config.apiKey;
          break;
        case "openai":
          env.OPENAI_API_KEY = this.config.apiKey;
          break;
        case "groq":
          env.GROQ_API_KEY = this.config.apiKey;
          break;
        case "xai":
          env.XAI_API_KEY = this.config.apiKey;
          break;
        case "openrouter":
          env.OPENROUTER_API_KEY = this.config.apiKey;
          break;
      }
    }
    return env;
  }
  /** Create a fresh temporary directory for this invocation. */
  async createWorkspace() {
    return await (0, import_promises10.mkdtemp)(import_node_path11.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
  }
  /** Remove the temporary workspace; failures are deliberately ignored. */
  async cleanupWorkspace(workspaceRoot) {
    try {
      await (0, import_promises10.rm)(workspaceRoot, { recursive: true, force: true });
    } catch {
      // Best-effort cleanup of a temp directory.
    }
  }
  /** Directory for stream logs: config.logDir, else .agentv/logs/pi-coding-agent under cwd. */
  resolveLogDirectory() {
    if (this.config.logDir) {
      return import_node_path11.default.resolve(this.config.logDir);
    }
    return import_node_path11.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
  }
  /**
   * Create the per-request stream logger and record its log entry.
   * Returns undefined (after a warning) when the directory or the logger
   * cannot be created.
   */
  async createStreamLogger(request) {
    const logDir = this.resolveLogDirectory();
    if (!logDir) {
      return void 0;
    }
    try {
      await (0, import_promises10.mkdir)(logDir, { recursive: true });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
      return void 0;
    }
    const filePath = import_node_path11.default.join(logDir, buildLogFilename2(request, this.targetName));
    try {
      const logger = await PiStreamLogger.create({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt,
        format: this.config.logFormat ?? "summary"
      });
      recordPiLogEntry({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt
      });
      return logger;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Pi stream logging for ${filePath}: ${message}`);
      return void 0;
    }
  }
};
3603
/**
 * Appends the Pi agent's stdout/stderr to a log file, one formatted line per
 * complete input line. Partial lines are buffered until their newline arrives
 * or until `close()` flushes them.
 */
var PiStreamLogger = class _PiStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;
  /**
   * @param {string} filePath Log file to append to.
   * @param {string} format "json" selects formatPiJsonLog; any other value
   *   selects formatPiLogMessage.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = (0, import_node_fs4.createWriteStream)(filePath, { flags: "a" });
    // Without a listener, an asynchronous open/write failure is an unhandled
    // 'error' event. Logging is best-effort, so warn instead.
    this.stream.on("error", (error) => {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Pi stream log ${filePath} failed: ${message}`);
    });
  }
  /**
   * Create a logger and write its header block.
   *
   * @param {{ filePath: string, format: string, targetName: string,
   *   evalCaseId?: string, attempt?: number }} options
   * @returns {Promise<_PiStreamLogger>}
   */
  static async create(options) {
    const logger = new _PiStreamLogger(options.filePath, options.format);
    const header = [
      "# Pi Coding Agent stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      // attempt is zero-based; show it one-based
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
      ""
      // Drop only the omitted optional entries; the trailing "" is the blank
      // line separating the header from the stream output and must survive.
    ].filter((line) => line !== void 0);
    logger.writeLines(header);
    return logger;
  }
  /** Buffer a stdout chunk and write out every complete line it finishes. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }
  /** Buffer a stderr chunk and write out every complete line it finishes. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }
  /**
   * Flush everything still buffered and end the stream.
   * Rejects if the stream emits 'error' while closing.
   */
  async close() {
    this.flushBuffer("stdout");
    this.flushBuffer("stderr");
    this.flushRemainder();
    await new Promise((resolve, reject) => {
      this.stream.once("error", reject);
      this.stream.end(() => resolve());
    });
  }
  /** Write each entry followed by a newline, without formatting. */
  writeLines(lines) {
    for (const line of lines) {
      this.stream.write(`${line}\n`);
    }
  }
  /** Format one raw line and append it; blank lines are skipped. */
  writeFormatted(rawLine, source) {
    const formatted = this.formatLine(rawLine, source);
    if (formatted) {
      this.stream.write(formatted);
      this.stream.write("\n");
    }
  }
  /** Write all complete lines of one buffer, keeping the trailing partial line. */
  flushBuffer(source) {
    const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
    const lines = buffer.split(/\r?\n/);
    const remainder = lines.pop() ?? "";
    if (source === "stdout") {
      this.stdoutBuffer = remainder;
    } else {
      this.stderrBuffer = remainder;
    }
    for (const line of lines) {
      this.writeFormatted(line, source);
    }
  }
  /**
   * Render one line as `[+elapsed] [source] message`.
   * @returns {string | undefined} undefined for whitespace-only input.
   */
  formatLine(rawLine, source) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
    return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
  }
  /** Write any unterminated final line of each stream, then clear both buffers. */
  flushRemainder() {
    this.writeFormatted(this.stdoutBuffer, "stdout");
    this.writeFormatted(this.stderrBuffer, "stderr");
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
};
3697
/**
 * Builds a unique log file name of the form
 * `<timestamp>_<target>_<evalId>[_attempt-N]_<8 hex chars>.log`.
 *
 * @param request    Object with optional `evalCaseId` and zero-based `attempt`.
 * @param targetName Target name; sanitized before use.
 * @returns The file name (no directory component).
 */
function buildLogFilename2(request, targetName) {
  // ':' and '.' from the ISO timestamp are replaced with '-'.
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
  const safeTarget = sanitizeForFilename2(targetName);
  const safeEvalId = sanitizeForFilename2(request.evalCaseId ?? "pi");
  let attemptPart = "";
  if (request.attempt !== void 0) {
    attemptPart = `_attempt-${request.attempt + 1}`;
  }
  const uniqueTag = (0, import_node_crypto2.randomUUID)().slice(0, 8);
  return `${stamp}_${safeTarget}_${safeEvalId}${attemptPart}_${uniqueTag}.log`;
}
3704
/**
 * Replaces every run of characters outside `[A-Za-z0-9._-]` with a single
 * underscore.
 *
 * @param value String to sanitize.
 * @returns The sanitized string, or "pi" when the result would be empty.
 */
function sanitizeForFilename2(value) {
  const cleaned = value.replace(/[^A-Za-z0-9._-]+/g, "_");
  if (cleaned.length === 0) {
    return "pi";
  }
  return cleaned;
}
3708
/**
 * Formats the whole seconds elapsed since `startedAt` as `MM:SS`, or as
 * `HH:MM:SS` once at least one hour has passed.
 *
 * @param startedAt Start time in epoch milliseconds.
 * @returns Zero-padded elapsed time.
 */
function formatElapsed2(startedAt) {
  const twoDigits = (amount) => amount.toString().padStart(2, "0");
  const totalSeconds = Math.floor((Date.now() - startedAt) / 1e3);
  const hours = Math.floor(totalSeconds / 3600);
  const fields = [Math.floor(totalSeconds % 3600 / 60), totalSeconds % 60];
  if (hours > 0) {
    fields.unshift(hours);
  }
  return fields.map((amount) => twoDigits(amount)).join(":");
}
3718
/**
 * Produces the summary-mode text for one raw output line.
 *
 * When the line parses as JSON and the event can be summarized, the summary is
 * returned. Otherwise the raw line is returned, prefixed with `stderr: ` when
 * it came from stderr.
 *
 * @param rawLine Trimmed line of process output.
 * @param source  "stdout" or "stderr".
 */
function formatPiLogMessage(rawLine, source) {
  const event = tryParseJsonValue2(rawLine);
  const summary = event ? summarizePiEvent(event) : void 0;
  if (summary) {
    return summary;
  }
  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
3731
/**
 * Produces the json-mode text for one raw output line: the parsed value
 * re-serialized with two-space indentation.
 *
 * @param rawLine Trimmed line of process output.
 * @returns Pretty-printed JSON, or `rawLine` unchanged when it does not parse,
 *          parses to a falsy value, or cannot be serialized.
 */
function formatPiJsonLog(rawLine) {
  const value = tryParseJsonValue2(rawLine);
  if (value) {
    try {
      return JSON.stringify(value, null, 2);
    } catch {
      // Fall through to the raw line.
    }
  }
  return rawLine;
}
3742
/**
 * Reduces a parsed Pi event to a short single-line description for the
 * summary log.
 *
 * - `message_start` / `message_end`: `<type>: <role>` when the message carries
 *   a string role, otherwise just `<type>`.
 * - `message_update`: `text_delta: <first 50 chars>` for text deltas, else
 *   `message_update: <event type>`, or plain `message_update` when the nested
 *   event has no string type.
 * - every other event: its `type` string.
 *
 * Missing fields are omitted rather than rendered as the text "undefined".
 *
 * @param event Parsed JSON value.
 * @returns The summary, or `undefined` when `event` is not an object with a
 *          non-empty string `type`.
 */
function summarizePiEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const record = event;
  const type = typeof record.type === "string" ? record.type : void 0;
  if (!type) {
    return void 0;
  }
  const isLabel = (value) => typeof value === "string" && value.length > 0;
  switch (type) {
    case "message_start":
    case "message_end": {
      const role = record.message?.role;
      return isLabel(role) ? `${type}: ${role}` : type;
    }
    case "message_update": {
      const update = record.assistantMessageEvent;
      const updateType = update?.type;
      if (updateType === "text_delta" && typeof update.delta === "string") {
        const delta = update.delta;
        // Keep log lines short: show at most the first 50 characters.
        const preview = delta.length > 50 ? `${delta.slice(0, 50)}...` : delta;
        return `text_delta: ${preview}`;
      }
      return isLabel(updateType) ? `message_update: ${updateType}` : "message_update";
    }
    default:
      // agent_start, agent_end, turn_start, turn_end and unknown events are
      // all described by their type alone.
      return type;
  }
}
3782
/**
 * Parses `rawLine` as JSON without throwing.
 *
 * @param rawLine Text to parse.
 * @returns The parsed value, or `undefined` when the text is not valid JSON.
 */
function tryParseJsonValue2(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  return value;
}
3789
/**
 * Parses newline-delimited JSON output from the Pi coding agent.
 *
 * Blank lines and lines that are not valid JSON are skipped.
 *
 * @param output Raw stdout text.
 * @returns The parsed values, in input order.
 * @throws Error when `output` is blank, or when no line parses as JSON.
 */
function parsePiJsonl(output) {
  const body = output.trim();
  if (body.length === 0) {
    throw new Error("Pi coding agent produced no output");
  }
  const events = [];
  for (const rawLine of body.split(/\r?\n/)) {
    const candidate = rawLine.trim();
    if (candidate.length === 0) {
      continue;
    }
    try {
      events.push(JSON.parse(candidate));
    } catch {
      // Non-JSON lines are ignored; only structured events are collected.
    }
  }
  if (events.length === 0) {
    throw new Error("Pi coding agent produced no valid JSON output");
  }
  return events;
}
3807
/**
 * Extracts the conversation from a list of parsed Pi events.
 *
 * The last `agent_end` event whose `messages` field is an array wins: its
 * messages are converted and returned. If there is none, the `message` of
 * every `turn_end` event is converted instead, in event order. Messages that
 * cannot be converted are dropped.
 *
 * @param events Parsed events.
 * @returns Converted messages (possibly empty).
 */
function extractOutputMessages(events) {
  const isRecord = (value) => Boolean(value) && typeof value === "object";
  // Scan from the end so the final agent_end takes precedence.
  let cursor = events.length;
  while (cursor-- > 0) {
    const candidate = events[cursor];
    if (isRecord(candidate) && candidate.type === "agent_end" && Array.isArray(candidate.messages)) {
      return candidate.messages.map(convertPiMessage).filter((m) => m !== void 0);
    }
  }
  const collected = [];
  for (const candidate of events) {
    if (!isRecord(candidate) || candidate.type !== "turn_end") {
      continue;
    }
    const converted = convertPiMessage(candidate.message);
    if (converted) {
      collected.push(converted);
    }
  }
  return collected;
}
3839
/**
 * Converts one Pi message object into the output-message shape
 * `{ role, content, toolCalls, timestamp, metadata }`.
 *
 * - `content` is the text extracted from the message content.
 * - `toolCalls` is `undefined` when the content holds no tool calls.
 * - `timestamp` is an ISO-8601 string for numeric (epoch ms) timestamps, the
 *   value itself for string timestamps, and `undefined` otherwise. A numeric
 *   timestamp that is not a representable date yields `undefined` instead of
 *   throwing.
 * - `metadata` collects the truthy `api`, `provider`, `model`, `usage` and
 *   `stopReason` fields, or is `undefined` when none are present.
 *
 * @param message Candidate message value.
 * @returns The converted message, or `undefined` when `message` is not an
 *          object with a string `role`.
 */
function convertPiMessage(message) {
  if (!message || typeof message !== "object") {
    return void 0;
  }
  const msg = message;
  const role = msg.role;
  if (typeof role !== "string") {
    return void 0;
  }
  const content = extractTextContent(msg.content);
  const toolCalls = extractToolCalls(msg.content);
  let timestamp;
  if (typeof msg.timestamp === "number") {
    const date = new Date(msg.timestamp);
    // toISOString() throws RangeError on an invalid Date (e.g. 1e300), which
    // would otherwise abort conversion of the whole transcript.
    timestamp = Number.isNaN(date.getTime()) ? void 0 : date.toISOString();
  } else if (typeof msg.timestamp === "string") {
    timestamp = msg.timestamp;
  }
  const metadata = {};
  for (const key of ["api", "provider", "model", "usage", "stopReason"]) {
    if (msg[key]) {
      metadata[key] = msg[key];
    }
  }
  return {
    role,
    content,
    toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
    timestamp,
    metadata: Object.keys(metadata).length > 0 ? metadata : void 0
  };
}
3865
/**
 * Extracts plain text from message content.
 *
 * @param content Either a string, or an array of parts where text parts look
 *                like `{ type: "text", text: string }`.
 * @returns The string itself; or all text parts joined with "\n"; or
 *          `undefined` when content is neither, or has no text parts.
 */
function extractTextContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const pieces = content
    .filter((part) => part && typeof part === "object" && part.type === "text" && typeof part.text === "string")
    .map((part) => part.text);
  if (pieces.length === 0) {
    return void 0;
  }
  return pieces.join("\n");
}
3884
/**
 * Collects tool calls from an array of message content parts.
 *
 * A `{ type: "tool_use", name }` part adds `{ tool, input, id }` (`id` is
 * `undefined` unless it is a string). A `{ type: "tool_result", tool_use_id }`
 * part attaches its `content` as `output` to the first already-collected call
 * with that id; results with no matching call are ignored.
 *
 * @param content Message content.
 * @returns Tool calls in order of appearance; empty when `content` is not an array.
 */
function extractToolCalls(content) {
  if (!Array.isArray(content)) {
    return [];
  }
  const calls = [];
  for (const part of content) {
    if (!part || typeof part !== "object") {
      continue;
    }
    switch (part.type) {
      case "tool_use":
        if (typeof part.name === "string") {
          calls.push({
            tool: part.name,
            input: part.input,
            id: typeof part.id === "string" ? part.id : void 0
          });
        }
        break;
      case "tool_result":
        if (typeof part.tool_use_id === "string") {
          const position = calls.findIndex((call) => call.id === part.tool_use_id);
          if (position !== -1) {
            calls[position] = { ...calls[position], output: part.content };
          }
        }
        break;
      default:
        break;
    }
  }
  return calls;
}
3914
/**
 * Returns the content of the last assistant message that has truthy content.
 *
 * @param messages Converted messages.
 * @returns String content as-is, non-string content as JSON text, or "" when
 *          no assistant message has content.
 */
function extractAssistantText2(messages) {
  let cursor = messages.length;
  while (cursor-- > 0) {
    const { role, content } = messages[cursor];
    if (role !== "assistant" || !content) {
      continue;
    }
    return typeof content === "string" ? content : JSON.stringify(content);
  }
  return "";
}
3926
/**
 * Rewrites every `@[label]:` marker in the prompt as `[[label]]:`, removing
 * the leading `@`.
 *
 * @param prompt Prompt text.
 * @returns The prompt with all such markers rewritten.
 */
function escapeAtSymbols(prompt) {
  return prompt.replace(/@\[([^\]]+)\]:/g, (_marker, label) => `[[${label}]]:`);
}
3929
/**
 * Chooses the text to show as failure detail: stderr when it is non-blank,
 * otherwise stdout.
 *
 * @param stderr Captured stderr.
 * @param stdout Captured stdout.
 * @returns The trimmed text, or `undefined` when both are blank.
 */
function pickDetail2(stderr, stdout) {
  for (const candidate of [stderr.trim(), stdout.trim()]) {
    if (candidate.length > 0) {
      return candidate;
    }
  }
  return void 0;
}
3937
/**
 * Builds the ` after Ns` suffix for timeout messages.
 *
 * @param timeoutMs Timeout in milliseconds.
 * @returns ` after <seconds>s` with the seconds rounded up, or "" when the
 *          timeout is falsy or not positive.
 */
function formatTimeoutSuffix3(timeoutMs) {
  const hasTimeout = !(!timeoutMs || timeoutMs <= 0);
  return hasTimeout ? ` after ${Math.ceil(timeoutMs / 1e3)}s` : "";
}
3944
/**
 * Spawns the Pi executable (without a shell) and collects its output.
 *
 * `options.executable` may contain extra whitespace-separated arguments
 * (e.g. "npx pi"); they are placed before `options.args`. Surrounding
 * whitespace is ignored, so a stray leading or trailing space neither yields
 * an empty program name nor adds an empty argument.
 *
 * @param options `{ executable, args, cwd, env, signal?, timeoutMs?,
 *                onStdoutChunk?, onStderrChunk? }`. Aborting `signal`, or
 *                exceeding a positive `timeoutMs`, sends SIGTERM to the child.
 * @returns Resolves on child `close` with `{ stdout, stderr, exitCode,
 *          timedOut }`; `exitCode` is -1 when no numeric exit code is reported.
 *          Rejects when the child emits `error`.
 */
async function defaultPiRunner(options) {
  return await new Promise((resolve, reject) => {
    // Trim first: splitting " pi" or "pi " on whitespace would otherwise
    // produce an empty first or last element.
    const [executable, ...executableArgs] = options.executable.trim().split(/\s+/);
    const allArgs = [...executableArgs, ...options.args];
    const child = (0, import_node_child_process3.spawn)(executable, allArgs, {
      cwd: options.cwd,
      env: options.env,
      stdio: ["pipe", "pipe", "pipe"],
      shell: false
    });
    let stdout = "";
    let stderr = "";
    let timedOut = false;
    const onAbort = () => {
      child.kill("SIGTERM");
    };
    if (options.signal) {
      if (options.signal.aborted) {
        // Already cancelled before the child started: terminate right away.
        onAbort();
      } else {
        options.signal.addEventListener("abort", onAbort, { once: true });
      }
    }
    let timeoutHandle;
    if (options.timeoutMs && options.timeoutMs > 0) {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        child.kill("SIGTERM");
      }, options.timeoutMs);
      // Do not let the pending timer alone keep the process alive.
      timeoutHandle.unref?.();
    }
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
      options.onStdoutChunk?.(chunk);
    });
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
      options.onStderrChunk?.(chunk);
    });
    // Nothing is written to the child's stdin; close it immediately.
    child.stdin.end();
    const cleanup = () => {
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
      }
      if (options.signal) {
        options.signal.removeEventListener("abort", onAbort);
      }
    };
    child.on("error", (error) => {
      cleanup();
      reject(error);
    });
    child.on("close", (code) => {
      cleanup();
      resolve({
        stdout,
        stderr,
        exitCode: typeof code === "number" ? code : -1,
        timedOut
      });
    });
  });
}
4011
+
4012
+ // src/evaluation/providers/targets.ts
4013
+ var import_node_path12 = __toESM(require("path"), 1);
4014
+ var import_zod = require("zod");
4015
/**
 * Schemas for CLI target settings as written by users. Both snake_case and
 * camelCase spellings of a setting are accepted; timeouts are in seconds.
 */

/** HTTP healthcheck input: a non-empty `url` and an optional positive timeout. */
var CliHealthcheckHttpInputSchema = import_zod.z.object({
  type: import_zod.z.literal("http"),
  url: import_zod.z.string().min(1, "healthcheck URL is required"),
  timeout_seconds: import_zod.z.number().positive().optional(),
  timeoutSeconds: import_zod.z.number().positive().optional()
});

/** Command healthcheck input: command template, optional cwd and timeout. */
var CliHealthcheckCommandInputSchema = import_zod.z.object({
  type: import_zod.z.literal("command"),
  command_template: import_zod.z.string().optional(),
  commandTemplate: import_zod.z.string().optional(),
  cwd: import_zod.z.string().optional(),
  timeout_seconds: import_zod.z.number().positive().optional(),
  timeoutSeconds: import_zod.z.number().positive().optional()
});

/** Healthcheck input, discriminated on `type` ("http" | "command"). */
var CliHealthcheckInputSchema = import_zod.z.discriminatedUnion("type", [
  CliHealthcheckHttpInputSchema,
  CliHealthcheckCommandInputSchema
]);

/**
 * Full CLI target input. `provider` must equal "cli" case-insensitively, and
 * at least one of `command_template` / `commandTemplate` must be present.
 */
var CliTargetInputSchema = import_zod.z.object({
  name: import_zod.z.string().min(1, "target name is required"),
  provider: import_zod.z.string().refine((p) => p.toLowerCase() === "cli", { message: "provider must be 'cli'" }),
  // Command template - required (accept both naming conventions)
  command_template: import_zod.z.string().optional(),
  commandTemplate: import_zod.z.string().optional(),
  // Files format - optional
  files_format: import_zod.z.string().optional(),
  filesFormat: import_zod.z.string().optional(),
  attachments_format: import_zod.z.string().optional(),
  attachmentsFormat: import_zod.z.string().optional(),
  // Working directory - optional
  cwd: import_zod.z.string().optional(),
  // Timeout in seconds - optional
  timeout_seconds: import_zod.z.number().positive().optional(),
  timeoutSeconds: import_zod.z.number().positive().optional(),
  // Healthcheck configuration - optional
  healthcheck: CliHealthcheckInputSchema.optional(),
  // Verbose mode - optional
  verbose: import_zod.z.boolean().optional(),
  cli_verbose: import_zod.z.boolean().optional(),
  cliVerbose: import_zod.z.boolean().optional(),
  // Keep temp files - optional
  keep_temp_files: import_zod.z.boolean().optional(),
  keepTempFiles: import_zod.z.boolean().optional(),
  keep_output_files: import_zod.z.boolean().optional(),
  keepOutputFiles: import_zod.z.boolean().optional(),
  // Common target fields
  judge_target: import_zod.z.string().optional(),
  workers: import_zod.z.number().int().min(1).optional(),
  provider_batching: import_zod.z.boolean().optional(),
  providerBatching: import_zod.z.boolean().optional()
}).refine(
  // Valid unless both spellings of the command template are missing.
  (data) => !(data.command_template === void 0 && data.commandTemplate === void 0),
  { message: "Either command_template or commandTemplate is required" }
);
4068
/**
 * Schemas for the normalized CLI configuration: camelCase only, timeouts in
 * milliseconds, and unknown keys rejected via `.strict()`.
 */

/** Normalized HTTP healthcheck. */
var CliHealthcheckHttpSchema = import_zod.z
  .object({
    type: import_zod.z.literal("http"),
    url: import_zod.z.string().min(1),
    timeoutMs: import_zod.z.number().positive().optional()
  })
  .strict();

/** Normalized command healthcheck. */
var CliHealthcheckCommandSchema = import_zod.z
  .object({
    type: import_zod.z.literal("command"),
    commandTemplate: import_zod.z.string().min(1),
    cwd: import_zod.z.string().optional(),
    timeoutMs: import_zod.z.number().positive().optional()
  })
  .strict();

/** Normalized healthcheck, discriminated on `type`. */
var CliHealthcheckSchema = import_zod.z.discriminatedUnion("type", [
  CliHealthcheckHttpSchema,
  CliHealthcheckCommandSchema
]);

/** Normalized CLI target configuration. */
var CliTargetConfigSchema = import_zod.z
  .object({
    commandTemplate: import_zod.z.string().min(1),
    filesFormat: import_zod.z.string().optional(),
    cwd: import_zod.z.string().optional(),
    timeoutMs: import_zod.z.number().positive().optional(),
    healthcheck: CliHealthcheckSchema.optional(),
    verbose: import_zod.z.boolean().optional(),
    keepTempFiles: import_zod.z.boolean().optional()
  })
  .strict();
4092
/**
 * Converts a user-written healthcheck into its normalized form.
 *
 * The timeout (either spelling, in seconds) is converted to whole
 * milliseconds. For `http` checks the URL is resolved against `env`. For
 * `command` checks the command template and cwd are resolved, and a relative
 * cwd is made absolute relative to the directory of `evalFilePath` when that
 * path is given.
 *
 * @param input        Healthcheck input (`type` is "http" or "command").
 * @param env          Environment used for value resolution.
 * @param targetName   Target name, used in error/description text.
 * @param evalFilePath Optional eval file path anchoring a relative cwd.
 * @returns `{ type: "http", url, timeoutMs }` or
 *          `{ type: "command", commandTemplate, cwd, timeoutMs }`.
 * @throws Error when a command healthcheck has no command template.
 */
function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
  const seconds = input.timeout_seconds ?? input.timeoutSeconds;
  const timeoutMs = seconds === void 0 ? void 0 : Math.floor(seconds * 1e3);
  if (input.type === "http") {
    return {
      type: "http",
      url: resolveString(input.url, env, `${targetName} healthcheck URL`),
      timeoutMs
    };
  }
  const templateSource = input.command_template ?? input.commandTemplate;
  if (templateSource === void 0) {
    throw new Error(
      `${targetName} healthcheck: Either command_template or commandTemplate is required for command healthcheck`
    );
  }
  const commandTemplate = resolveString(
    templateSource,
    env,
    `${targetName} healthcheck command template`,
    true
  );
  const configuredCwd = resolveOptionalString(input.cwd, env, `${targetName} healthcheck cwd`, {
    allowLiteral: true,
    optionalEnv: true
  });
  const isRelativeToEvalFile = configuredCwd && evalFilePath && !import_node_path12.default.isAbsolute(configuredCwd);
  const cwd = isRelativeToEvalFile
    ? import_node_path12.default.resolve(import_node_path12.default.dirname(import_node_path12.default.resolve(evalFilePath)), configuredCwd)
    : configuredCwd;
  return {
    type: "command",
    commandTemplate,
    cwd,
    timeoutMs
  };
}
4129
/**
 * Converts user-written CLI target settings into the normalized config
 * `{ commandTemplate, filesFormat, cwd, timeoutMs, healthcheck, verbose,
 * keepTempFiles }`.
 *
 * For each setting the snake_case spelling wins over camelCase, and legacy
 * aliases (`attachments_format`, `cli_verbose`, `keep_output_files`) are
 * consulted last. When `evalFilePath` is given, a relative cwd is resolved
 * against the eval file's directory and a missing cwd defaults to that
 * directory.
 *
 * @param input        Validated CLI target input.
 * @param env          Environment used for value resolution.
 * @param evalFilePath Optional eval file path.
 * @throws Error when no command template is configured.
 */
function normalizeCliTargetInput(input, env, evalFilePath) {
  const targetName = input.name;
  const templateSource = input.command_template ?? input.commandTemplate;
  if (templateSource === void 0) {
    throw new Error(`${targetName}: Either command_template or commandTemplate is required`);
  }
  const commandTemplate = resolveString(
    templateSource,
    env,
    `${targetName} CLI command template`,
    true
  );
  const filesFormat = resolveOptionalLiteralString(
    input.files_format ?? input.filesFormat ?? input.attachments_format ?? input.attachmentsFormat
  );
  let cwd = resolveOptionalString(input.cwd, env, `${targetName} working directory`, {
    allowLiteral: true,
    optionalEnv: true
  });
  if (evalFilePath) {
    const evalDir = () => import_node_path12.default.dirname(import_node_path12.default.resolve(evalFilePath));
    if (!cwd) {
      cwd = evalDir();
    } else if (!import_node_path12.default.isAbsolute(cwd)) {
      cwd = import_node_path12.default.resolve(evalDir(), cwd);
    }
  }
  const seconds = input.timeout_seconds ?? input.timeoutSeconds;
  const timeoutMs = seconds === void 0 ? void 0 : Math.floor(seconds * 1e3);
  const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose ?? input.cliVerbose);
  const keepTempFiles = resolveOptionalBoolean(
    input.keep_temp_files ?? input.keepTempFiles ?? input.keep_output_files ?? input.keepOutputFiles
  );
  let healthcheck;
  if (input.healthcheck) {
    healthcheck = normalizeCliHealthcheck(input.healthcheck, env, targetName, evalFilePath);
  }
  return {
    commandTemplate,
    filesFormat,
    cwd,
    timeoutMs,
    healthcheck,
    verbose,
    keepTempFiles
  };
}
4170
/** Set of the six CLI placeholder names. */
var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID", "ATTEMPT", "FILES", "OUTPUT_FILE"]);
4178
/**
 * Minimal schema shared by every target definition: non-empty `name` and
 * `provider`, optional `judge_target` and `workers` (integer >= 1). Other
 * keys are kept as-is (`passthrough`) so the provider-specific resolvers can
 * read them.
 */
var BASE_TARGET_SCHEMA = import_zod.z
  .object({
    name: import_zod.z.string().min(1, "target name is required"),
    provider: import_zod.z.string().min(1, "provider is required"),
    judge_target: import_zod.z.string().optional(),
    workers: import_zod.z.number().int().min(1).optional()
  })
  .passthrough();
4184
/** Azure OpenAI API version used when none is configured. */
var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";

/**
 * Normalizes a configured Azure API version.
 *
 * Surrounding whitespace is removed, as is a leading `api-version=` /
 * `api_version=` / `apiversion=` prefix (case-insensitive, spaces around `=`
 * allowed).
 *
 * @param value Configured version, possibly undefined or blank.
 * @returns The bare version string, or the default when nothing usable remains.
 */
function normalizeAzureApiVersion(value) {
  const trimmed = value ? value.trim() : "";
  const bare = trimmed.replace(/^api[-_]?version\s*=\s*/i, "").trim();
  if (bare.length === 0) {
    return DEFAULT_AZURE_API_VERSION;
  }
  return bare;
}
4196
/**
 * Reads the retry settings of a target (snake_case spelling preferred over
 * camelCase).
 *
 * @param target Parsed target definition.
 * @returns `{ maxRetries, initialDelayMs, maxDelayMs, backoffFactor,
 *          retryableStatusCodes }`, or `undefined` when none of the five
 *          settings resolves to a value.
 */
function resolveRetryConfig(target) {
  const retry = {
    maxRetries: resolveOptionalNumber(
      target.max_retries ?? target.maxRetries,
      `${target.name} max retries`
    ),
    initialDelayMs: resolveOptionalNumber(
      target.retry_initial_delay_ms ?? target.retryInitialDelayMs,
      `${target.name} retry initial delay`
    ),
    maxDelayMs: resolveOptionalNumber(
      target.retry_max_delay_ms ?? target.retryMaxDelayMs,
      `${target.name} retry max delay`
    ),
    backoffFactor: resolveOptionalNumber(
      target.retry_backoff_factor ?? target.retryBackoffFactor,
      `${target.name} retry backoff factor`
    ),
    retryableStatusCodes: resolveOptionalNumberArray(
      target.retry_status_codes ?? target.retryStatusCodes,
      `${target.name} retry status codes`
    )
  };
  const nothingConfigured = Object.values(retry).every((setting) => setting === void 0);
  return nothingConfigured ? void 0 : retry;
}
4228
/**
 * Resolves a raw target definition into
 * `{ kind, name, judgeTarget, workers, providerBatching, config }`.
 *
 * The provider name is matched case-insensitively; aliases map onto one kind
 * (`azure-openai` -> azure, `google` / `google-gemini` -> gemini, `codex-cli`
 * -> codex, `pi` -> pi-coding-agent). `vscode` and `vscode-insiders` keep the
 * provider name as their kind.
 *
 * @param definition   Raw target definition.
 * @param env          Environment used for value resolution (defaults to `process.env`).
 * @param evalFilePath Optional eval file path, passed to the CLI resolver.
 * @throws Error for an unsupported provider; schema and resolver errors propagate.
 */
function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
  const provider = parsed.provider.toLowerCase();
  const providerBatching = resolveOptionalBoolean(
    parsed.provider_batching ?? parsed.providerBatching
  );
  // Every provider shares the same envelope; only `kind` and `config` differ.
  const describe = (kind, config) => ({
    kind,
    name: parsed.name,
    judgeTarget: parsed.judge_target,
    workers: parsed.workers,
    providerBatching,
    config
  });
  switch (provider) {
    case "azure":
    case "azure-openai":
      return describe("azure", resolveAzureConfig(parsed, env));
    case "anthropic":
      return describe("anthropic", resolveAnthropicConfig(parsed, env));
    case "gemini":
    case "google":
    case "google-gemini":
      return describe("gemini", resolveGeminiConfig(parsed, env));
    case "codex":
    case "codex-cli":
      return describe("codex", resolveCodexConfig(parsed, env));
    case "pi":
    case "pi-coding-agent":
      return describe("pi-coding-agent", resolvePiCodingAgentConfig(parsed, env));
    case "mock":
      return describe("mock", resolveMockConfig(parsed));
    case "vscode":
    case "vscode-insiders":
      return describe(provider, resolveVSCodeConfig(parsed, env, provider === "vscode-insiders"));
    case "cli":
      return describe("cli", resolveCliConfig(parsed, env, evalFilePath));
    default:
      throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
  }
}
4317
/**
 * Resolves the Azure OpenAI settings of a target.
 *
 * Sources, first defined wins: endpoint = `endpoint` | `resource` |
 * `resourceName`; key = `api_key` | `apiKey`; deployment = `deployment` |
 * `deploymentName` | `model`; version = `version` | `api_version`; max tokens
 * = `max_output_tokens` | `maxTokens`.
 *
 * @param target Parsed target definition.
 * @param env    Environment used for value resolution.
 * @returns `{ resourceName, deploymentName, apiKey, version, temperature,
 *          maxOutputTokens, retry }`.
 */
function resolveAzureConfig(target, env) {
  const resourceName = resolveString(
    target.endpoint ?? target.resource ?? target.resourceName,
    env,
    `${target.name} endpoint`
  );
  const apiKey = resolveString(target.api_key ?? target.apiKey, env, `${target.name} api key`);
  const deploymentName = resolveString(
    target.deployment ?? target.deploymentName ?? target.model,
    env,
    `${target.name} deployment`
  );
  const configuredVersion = resolveOptionalString(
    target.version ?? target.api_version,
    env,
    `${target.name} api version`,
    { allowLiteral: true, optionalEnv: true }
  );
  const version = normalizeAzureApiVersion(configuredVersion);
  const temperature = resolveOptionalNumber(target.temperature, `${target.name} temperature`);
  const maxOutputTokens = resolveOptionalNumber(
    target.max_output_tokens ?? target.maxTokens,
    `${target.name} max output tokens`
  );
  const retry = resolveRetryConfig(target);
  return { resourceName, deploymentName, apiKey, version, temperature, maxOutputTokens, retry };
}
4349
/**
 * Resolves the Anthropic settings of a target.
 *
 * Sources, first defined wins: key = `api_key` | `apiKey`; model = `model` |
 * `deployment` | `variant`; max tokens = `max_output_tokens` | `maxTokens`;
 * thinking budget = `thinking_budget` | `thinkingBudget`.
 *
 * @param target Parsed target definition.
 * @param env    Environment used for value resolution.
 * @returns `{ apiKey, model, temperature, maxOutputTokens, thinkingBudget, retry }`.
 */
function resolveAnthropicConfig(target, env) {
  const apiKey = resolveString(target.api_key ?? target.apiKey, env, `${target.name} Anthropic api key`);
  const model = resolveString(
    target.model ?? target.deployment ?? target.variant,
    env,
    `${target.name} Anthropic model`
  );
  const retry = resolveRetryConfig(target);
  const temperature = resolveOptionalNumber(target.temperature, `${target.name} temperature`);
  const maxOutputTokens = resolveOptionalNumber(
    target.max_output_tokens ?? target.maxTokens,
    `${target.name} max output tokens`
  );
  const thinkingBudget = resolveOptionalNumber(
    target.thinking_budget ?? target.thinkingBudget,
    `${target.name} thinking budget`
  );
  return { apiKey, model, temperature, maxOutputTokens, thinkingBudget, retry };
}
4367
/**
 * Resolves the Google Gemini settings of a target.
 *
 * Sources, first defined wins: key = `api_key` | `apiKey`; model = `model` |
 * `deployment` | `variant` (falls back to "gemini-2.5-flash" when unresolved);
 * max tokens = `max_output_tokens` | `maxTokens`.
 *
 * @param target Parsed target definition.
 * @param env    Environment used for value resolution.
 * @returns `{ apiKey, model, temperature, maxOutputTokens, retry }`.
 */
function resolveGeminiConfig(target, env) {
  const apiKey = resolveString(target.api_key ?? target.apiKey, env, `${target.name} Google API key`);
  const configuredModel = resolveOptionalString(
    target.model ?? target.deployment ?? target.variant,
    env,
    `${target.name} Gemini model`,
    { allowLiteral: true, optionalEnv: true }
  );
  const model = configuredModel ?? "gemini-2.5-flash";
  const retry = resolveRetryConfig(target);
  const temperature = resolveOptionalNumber(target.temperature, `${target.name} temperature`);
  const maxOutputTokens = resolveOptionalNumber(
    target.max_output_tokens ?? target.maxTokens,
    `${target.name} max output tokens`
  );
  return { apiKey, model, temperature, maxOutputTokens, retry };
}
4386
+ function resolveCodexConfig(target, env) {
4387
+ const executableSource = target.executable ?? target.command ?? target.binary;
4388
+ const argsSource = target.args ?? target.arguments;
4389
+ const cwdSource = target.cwd;
4390
+ const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
3429
4391
  const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
3430
4392
  const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
4393
+ const systemPromptSource = target.system_prompt ?? target.systemPrompt;
3431
4394
  const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
3432
4395
  allowLiteral: true,
3433
4396
  optionalEnv: true
@@ -3443,13 +4406,15 @@ function resolveCodexConfig(target, env) {
3443
4406
  optionalEnv: true
3444
4407
  });
3445
4408
  const logFormat = normalizeCodexLogFormat(logFormatSource);
4409
+ const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
3446
4410
  return {
3447
4411
  executable,
3448
4412
  args,
3449
4413
  cwd,
3450
4414
  timeoutMs,
3451
4415
  logDir,
3452
- logFormat
4416
+ logFormat,
4417
+ systemPrompt
3453
4418
  };
3454
4419
  }
3455
4420
  function normalizeCodexLogFormat(value) {
@@ -3465,6 +4430,70 @@ function normalizeCodexLogFormat(value) {
3465
4430
  }
3466
4431
  throw new Error("codex log format must be 'summary' or 'json'");
3467
4432
  }
4433
/**
 * Resolves the Pi coding agent settings of a target.
 *
 * Sources, first defined wins: executable = `executable` | `command` |
 * `binary` (falls back to "pi"); provider = `pi_provider` | `piProvider` |
 * `llm_provider`; model = `model` | `pi_model` | `piModel`; key = `api_key` |
 * `apiKey`; tools = `tools` | `pi_tools` | `piTools`; thinking = `thinking` |
 * `pi_thinking` | `piThinking`; args = `args` | `arguments`; timeout =
 * `timeout_seconds` | `timeoutSeconds`; log dir = `log_dir` | `logDir` |
 * `log_directory` | `logDirectory`; log format = `log_format` | `logFormat`;
 * system prompt = `system_prompt` | `systemPrompt`.
 *
 * The API key is resolved with `allowLiteral: false`; every other string
 * setting uses `allowLiteral: true`. A log format other than "json" or
 * "summary" is ignored, and the system prompt is trimmed and dropped when blank.
 *
 * @param target Parsed target definition.
 * @param env    Environment used for value resolution.
 * @returns `{ executable, provider, model, apiKey, tools, thinking, args, cwd,
 *          timeoutMs, logDir, logFormat, systemPrompt }`.
 */
function resolvePiCodingAgentConfig(target, env) {
  // A fresh options object per call, as in a literal written at each call site.
  const literalOrEnv = () => ({ allowLiteral: true, optionalEnv: true });
  const executable = resolveOptionalString(
    target.executable ?? target.command ?? target.binary,
    env,
    `${target.name} pi executable`,
    literalOrEnv()
  ) ?? "pi";
  const provider = resolveOptionalString(
    target.pi_provider ?? target.piProvider ?? target.llm_provider,
    env,
    `${target.name} pi provider`,
    literalOrEnv()
  );
  const model = resolveOptionalString(
    target.model ?? target.pi_model ?? target.piModel,
    env,
    `${target.name} pi model`,
    literalOrEnv()
  );
  const apiKey = resolveOptionalString(target.api_key ?? target.apiKey, env, `${target.name} pi api key`, {
    allowLiteral: false,
    optionalEnv: true
  });
  const tools = resolveOptionalString(
    target.tools ?? target.pi_tools ?? target.piTools,
    env,
    `${target.name} pi tools`,
    literalOrEnv()
  );
  const thinking = resolveOptionalString(
    target.thinking ?? target.pi_thinking ?? target.piThinking,
    env,
    `${target.name} pi thinking`,
    literalOrEnv()
  );
  const args = resolveOptionalStringArray(target.args ?? target.arguments, env, `${target.name} pi args`);
  const cwd = resolveOptionalString(target.cwd, env, `${target.name} pi cwd`, literalOrEnv());
  const timeoutMs = resolveTimeoutMs(
    target.timeout_seconds ?? target.timeoutSeconds,
    `${target.name} pi timeout`
  );
  const logDir = resolveOptionalString(
    target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory,
    env,
    `${target.name} pi log directory`,
    literalOrEnv()
  );
  const requestedLogFormat = target.log_format ?? target.logFormat;
  let logFormat;
  if (requestedLogFormat === "json" || requestedLogFormat === "summary") {
    logFormat = requestedLogFormat;
  }
  const requestedSystemPrompt = target.system_prompt ?? target.systemPrompt;
  let systemPrompt;
  if (typeof requestedSystemPrompt === "string" && requestedSystemPrompt.trim().length > 0) {
    systemPrompt = requestedSystemPrompt.trim();
  }
  return {
    executable,
    provider,
    model,
    apiKey,
    tools,
    thinking,
    args,
    cwd,
    timeoutMs,
    logDir,
    logFormat,
    systemPrompt
  };
}
3468
4497
  function resolveMockConfig(target) {
3469
4498
  const response = typeof target.response === "string" ? target.response : void 0;
3470
4499
  return { response };
@@ -3499,46 +4528,35 @@ function resolveVSCodeConfig(target, env, insiders) {
3499
4528
  workspaceTemplate
3500
4529
  };
3501
4530
  }
3502
- function resolveCliConfig(target, env, evalFilePath) {
3503
- const commandTemplateSource = target.command_template ?? target.commandTemplate;
3504
- const filesFormat = resolveOptionalLiteralString(
3505
- target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
3506
- );
3507
- const verbose = resolveOptionalBoolean(target.verbose ?? target.cli_verbose ?? target.cliVerbose);
3508
- const keepTempFiles = resolveOptionalBoolean(
3509
- target.keep_temp_files ?? target.keepTempFiles ?? target.keep_output_files ?? target.keepOutputFiles
3510
- );
3511
- let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
3512
- allowLiteral: true,
3513
- optionalEnv: true
3514
- });
3515
- if (cwd && evalFilePath && !import_node_path11.default.isAbsolute(cwd)) {
3516
- cwd = import_node_path11.default.resolve(import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath)), cwd);
4531
+ var cliErrorMap = (issue, ctx) => {
4532
+ if (issue.code === import_zod.z.ZodIssueCode.unrecognized_keys) {
4533
+ return { message: `Unknown CLI provider settings: ${issue.keys.join(", ")}` };
3517
4534
  }
3518
- if (!cwd && evalFilePath) {
3519
- cwd = import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath));
4535
+ if (issue.code === import_zod.z.ZodIssueCode.invalid_union_discriminator) {
4536
+ return { message: "healthcheck type must be 'http' or 'command'" };
3520
4537
  }
3521
- const timeoutMs = resolveTimeoutMs(
3522
- target.timeout_seconds ?? target.timeoutSeconds,
3523
- `${target.name} timeout`
3524
- );
3525
- const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name, evalFilePath);
3526
- const commandTemplate = resolveString(
3527
- commandTemplateSource,
3528
- env,
3529
- `${target.name} CLI command template`,
3530
- true
3531
- );
3532
- assertSupportedCliPlaceholders(commandTemplate, `${target.name} CLI command template`);
3533
- return {
3534
- commandTemplate,
3535
- filesFormat,
3536
- cwd,
3537
- timeoutMs,
3538
- healthcheck,
3539
- verbose,
3540
- keepTempFiles
3541
- };
4538
+ if (issue.code === import_zod.z.ZodIssueCode.invalid_type && issue.expected === "string") {
4539
+ return { message: `${ctx.defaultError} (expected a string value)` };
4540
+ }
4541
+ return { message: ctx.defaultError };
4542
+ };
4543
+ function resolveCliConfig(target, env, evalFilePath) {
4544
+ const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
4545
+ if (!parseResult.success) {
4546
+ const firstError = parseResult.error.errors[0];
4547
+ const path16 = firstError?.path.join(".") || "";
4548
+ const prefix = path16 ? `${target.name} ${path16}: ` : `${target.name}: `;
4549
+ throw new Error(`${prefix}${firstError?.message}`);
4550
+ }
4551
+ const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
4552
+ assertSupportedCliPlaceholders(normalized.commandTemplate, `${target.name} CLI command template`);
4553
+ if (normalized.healthcheck?.type === "command") {
4554
+ assertSupportedCliPlaceholders(
4555
+ normalized.healthcheck.commandTemplate,
4556
+ `${target.name} healthcheck command template`
4557
+ );
4558
+ }
4559
+ return normalized;
3542
4560
  }
3543
4561
  function resolveTimeoutMs(source, description) {
3544
4562
  const seconds = resolveOptionalNumber(source, `${description} (seconds)`);
@@ -3550,49 +4568,6 @@ function resolveTimeoutMs(source, description) {
3550
4568
  }
3551
4569
  return Math.floor(seconds * 1e3);
3552
4570
  }
3553
- function resolveCliHealthcheck(source, env, targetName, evalFilePath) {
3554
- if (source === void 0 || source === null) {
3555
- return void 0;
3556
- }
3557
- if (typeof source !== "object" || Array.isArray(source)) {
3558
- throw new Error(`${targetName} healthcheck must be an object`);
3559
- }
3560
- const candidate = source;
3561
- const type = candidate.type;
3562
- const timeoutMs = resolveTimeoutMs(
3563
- candidate.timeout_seconds ?? candidate.timeoutSeconds,
3564
- `${targetName} healthcheck timeout`
3565
- );
3566
- if (type === "http") {
3567
- const url = resolveString(candidate.url, env, `${targetName} healthcheck URL`);
3568
- return {
3569
- type: "http",
3570
- url,
3571
- timeoutMs
3572
- };
3573
- }
3574
- if (type === "command") {
3575
- const commandTemplate = resolveString(
3576
- candidate.command_template ?? candidate.commandTemplate,
3577
- env,
3578
- `${targetName} healthcheck command template`,
3579
- true
3580
- );
3581
- assertSupportedCliPlaceholders(commandTemplate, `${targetName} healthcheck command template`);
3582
- const cwd = resolveOptionalString(candidate.cwd, env, `${targetName} healthcheck cwd`, {
3583
- allowLiteral: true,
3584
- optionalEnv: true
3585
- });
3586
- const resolvedCwd = cwd && evalFilePath && !import_node_path11.default.isAbsolute(cwd) ? import_node_path11.default.resolve(import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath)), cwd) : cwd;
3587
- return {
3588
- type: "command",
3589
- commandTemplate,
3590
- timeoutMs,
3591
- cwd: resolvedCwd
3592
- };
3593
- }
3594
- throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
3595
- }
3596
4571
  function assertSupportedCliPlaceholders(template, description) {
3597
4572
  const placeholders = extractCliPlaceholders(template);
3598
4573
  for (const placeholder of placeholders) {
@@ -3758,7 +4733,7 @@ function resolveOptionalNumberArray(source, description) {
3758
4733
  }
3759
4734
 
3760
4735
  // src/evaluation/providers/vscode.ts
3761
- var import_node_path12 = __toESM(require("path"), 1);
4736
+ var import_node_path13 = __toESM(require("path"), 1);
3762
4737
  var import_subagent = require("subagent");
3763
4738
 
3764
4739
  // src/evaluation/providers/vscode-templates.ts
@@ -3928,7 +4903,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
3928
4903
  return "";
3929
4904
  }
3930
4905
  const buildList = (files) => files.map((absolutePath) => {
3931
- const fileName = import_node_path12.default.basename(absolutePath);
4906
+ const fileName = import_node_path13.default.basename(absolutePath);
3932
4907
  const fileUri = pathToFileUri2(absolutePath);
3933
4908
  return `* [${fileName}](${fileUri})`;
3934
4909
  });
@@ -3953,8 +4928,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
3953
4928
  }
3954
4929
  const unique = /* @__PURE__ */ new Map();
3955
4930
  for (const attachment of attachments) {
3956
- const absolutePath = import_node_path12.default.resolve(attachment);
3957
- const normalized = absolutePath.split(import_node_path12.default.sep).join("/");
4931
+ const absolutePath = import_node_path13.default.resolve(attachment);
4932
+ const normalized = absolutePath.split(import_node_path13.default.sep).join("/");
3958
4933
  if (isGuidelineFile(normalized, guidelinePatterns)) {
3959
4934
  if (!unique.has(absolutePath)) {
3960
4935
  unique.set(absolutePath, absolutePath);
@@ -3969,7 +4944,7 @@ function collectAttachmentFiles(attachments) {
3969
4944
  }
3970
4945
  const unique = /* @__PURE__ */ new Map();
3971
4946
  for (const attachment of attachments) {
3972
- const absolutePath = import_node_path12.default.resolve(attachment);
4947
+ const absolutePath = import_node_path13.default.resolve(attachment);
3973
4948
  if (!unique.has(absolutePath)) {
3974
4949
  unique.set(absolutePath, absolutePath);
3975
4950
  }
@@ -3977,7 +4952,7 @@ function collectAttachmentFiles(attachments) {
3977
4952
  return Array.from(unique.values());
3978
4953
  }
3979
4954
  function pathToFileUri2(filePath) {
3980
- const absolutePath = import_node_path12.default.isAbsolute(filePath) ? filePath : import_node_path12.default.resolve(filePath);
4955
+ const absolutePath = import_node_path13.default.isAbsolute(filePath) ? filePath : import_node_path13.default.resolve(filePath);
3981
4956
  const normalizedPath = absolutePath.replace(/\\/g, "/");
3982
4957
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
3983
4958
  return `file:///${normalizedPath}`;
@@ -3990,7 +4965,7 @@ function normalizeAttachments(attachments) {
3990
4965
  }
3991
4966
  const deduped = /* @__PURE__ */ new Set();
3992
4967
  for (const attachment of attachments) {
3993
- deduped.add(import_node_path12.default.resolve(attachment));
4968
+ deduped.add(import_node_path13.default.resolve(attachment));
3994
4969
  }
3995
4970
  return Array.from(deduped);
3996
4971
  }
@@ -3999,7 +4974,7 @@ function mergeAttachments(all) {
3999
4974
  for (const list of all) {
4000
4975
  if (!list) continue;
4001
4976
  for (const inputFile of list) {
4002
- deduped.add(import_node_path12.default.resolve(inputFile));
4977
+ deduped.add(import_node_path13.default.resolve(inputFile));
4003
4978
  }
4004
4979
  }
4005
4980
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -4046,9 +5021,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
4046
5021
  }
4047
5022
 
4048
5023
  // src/evaluation/providers/targets-file.ts
4049
- var import_node_fs4 = require("fs");
4050
- var import_promises10 = require("fs/promises");
4051
- var import_node_path13 = __toESM(require("path"), 1);
5024
+ var import_node_fs5 = require("fs");
5025
+ var import_promises11 = require("fs/promises");
5026
+ var import_node_path14 = __toESM(require("path"), 1);
4052
5027
  var import_yaml3 = require("yaml");
4053
5028
  function isRecord(value) {
4054
5029
  return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -4078,18 +5053,18 @@ function assertTargetDefinition(value, index, filePath) {
4078
5053
  }
4079
5054
  async function fileExists3(filePath) {
4080
5055
  try {
4081
- await (0, import_promises10.access)(filePath, import_node_fs4.constants.F_OK);
5056
+ await (0, import_promises11.access)(filePath, import_node_fs5.constants.F_OK);
4082
5057
  return true;
4083
5058
  } catch {
4084
5059
  return false;
4085
5060
  }
4086
5061
  }
4087
5062
  async function readTargetDefinitions(filePath) {
4088
- const absolutePath = import_node_path13.default.resolve(filePath);
5063
+ const absolutePath = import_node_path14.default.resolve(filePath);
4089
5064
  if (!await fileExists3(absolutePath)) {
4090
5065
  throw new Error(`targets.yaml not found at ${absolutePath}`);
4091
5066
  }
4092
- const raw = await (0, import_promises10.readFile)(absolutePath, "utf8");
5067
+ const raw = await (0, import_promises11.readFile)(absolutePath, "utf8");
4093
5068
  const parsed = (0, import_yaml3.parse)(raw);
4094
5069
  if (!isRecord(parsed)) {
4095
5070
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
@@ -4117,6 +5092,8 @@ function createProvider(target) {
4117
5092
  return new CliProvider(target.name, target.config);
4118
5093
  case "codex":
4119
5094
  return new CodexProvider(target.name, target.config);
5095
+ case "pi-coding-agent":
5096
+ return new PiCodingAgentProvider(target.name, target.config);
4120
5097
  case "mock":
4121
5098
  return new MockProvider(target.name, target.config);
4122
5099
  case "vscode":
@@ -4137,9 +5114,76 @@ function resolveAndCreateProvider(definition, env = process.env) {
4137
5114
  var import_ai2 = require("ai");
4138
5115
  var import_zod2 = require("zod");
4139
5116
 
5117
+ // src/runtime/exec.ts
5118
+ function getBunSpawn() {
5119
+ const bunSpawn = globalThis.Bun?.spawn;
5120
+ return typeof bunSpawn === "function" ? bunSpawn : void 0;
5121
+ }
5122
+ async function execShellWithStdin(command, stdinPayload, options = {}) {
5123
+ const bunSpawn = getBunSpawn();
5124
+ if (bunSpawn) {
5125
+ const encoder = new TextEncoder();
5126
+ const proc = bunSpawn({
5127
+ cmd: ["sh", "-c", command],
5128
+ cwd: options.cwd,
5129
+ stdin: encoder.encode(stdinPayload),
5130
+ stdout: "pipe",
5131
+ stderr: "pipe"
5132
+ });
5133
+ const timeout = options.timeoutMs ? setTimeout(() => {
5134
+ proc.kill();
5135
+ }, options.timeoutMs) : void 0;
5136
+ try {
5137
+ const stdout = await new Response(proc.stdout).text();
5138
+ const stderr = await new Response(proc.stderr).text();
5139
+ const exitCode = await proc.exited;
5140
+ return { stdout, stderr, exitCode };
5141
+ } finally {
5142
+ if (timeout !== void 0) {
5143
+ clearTimeout(timeout);
5144
+ }
5145
+ }
5146
+ }
5147
+ const { spawn: spawn3 } = await import("child_process");
5148
+ return await new Promise((resolve, reject) => {
5149
+ const child = spawn3(command, {
5150
+ shell: true,
5151
+ cwd: options.cwd,
5152
+ stdio: ["pipe", "pipe", "pipe"]
5153
+ });
5154
+ let stdout = "";
5155
+ let stderr = "";
5156
+ const timeout = options.timeoutMs ? setTimeout(() => {
5157
+ child.kill();
5158
+ reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
5159
+ }, options.timeoutMs) : void 0;
5160
+ child.stdout?.on("data", (data) => {
5161
+ stdout += data.toString();
5162
+ });
5163
+ child.stderr?.on("data", (data) => {
5164
+ stderr += data.toString();
5165
+ });
5166
+ child.on("error", (error) => {
5167
+ if (timeout !== void 0) {
5168
+ clearTimeout(timeout);
5169
+ }
5170
+ reject(error);
5171
+ });
5172
+ child.on("exit", (code) => {
5173
+ if (timeout !== void 0) {
5174
+ clearTimeout(timeout);
5175
+ }
5176
+ resolve({ stdout, stderr, exitCode: code ?? 0 });
5177
+ });
5178
+ child.stdin?.write(stdinPayload);
5179
+ child.stdin?.end();
5180
+ });
5181
+ }
5182
+
4140
5183
  // src/evaluation/providers/types.ts
4141
5184
  var AGENT_PROVIDER_KINDS = [
4142
5185
  "codex",
5186
+ "pi-coding-agent",
4143
5187
  "vscode",
4144
5188
  "vscode-insiders"
4145
5189
  ];
@@ -4438,17 +5482,17 @@ var CodeEvaluator = class {
4438
5482
  const inputPayload = JSON.stringify(
4439
5483
  {
4440
5484
  question: context.evalCase.question,
4441
- expected_outcome: context.evalCase.expected_outcome,
4442
- expected_messages: context.evalCase.expected_messages,
4443
- reference_answer: context.evalCase.reference_answer,
4444
- candidate_answer: context.candidate,
4445
- output_messages: context.outputMessages ?? null,
4446
- guideline_files: context.evalCase.guideline_paths,
4447
- input_files: context.evalCase.file_paths.filter(
4448
- (path15) => !context.evalCase.guideline_paths.includes(path15)
5485
+ expectedOutcome: context.evalCase.expected_outcome,
5486
+ expectedMessages: context.evalCase.expected_messages,
5487
+ referenceAnswer: context.evalCase.reference_answer,
5488
+ candidateAnswer: context.candidate,
5489
+ outputMessages: context.outputMessages ?? null,
5490
+ guidelineFiles: context.evalCase.guideline_paths,
5491
+ inputFiles: context.evalCase.file_paths.filter(
5492
+ (path16) => !context.evalCase.guideline_paths.includes(path16)
4449
5493
  ),
4450
- input_messages: context.evalCase.input_messages,
4451
- candidate_trace_summary: context.traceSummary ?? null
5494
+ inputMessages: context.evalCase.input_messages,
5495
+ traceSummary: context.traceSummary ?? null
4452
5496
  },
4453
5497
  null,
4454
5498
  2
@@ -4518,43 +5562,17 @@ function calculateRubricScore(result, rubrics) {
4518
5562
  return { score, verdict, hits, misses };
4519
5563
  }
4520
5564
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
4521
- const { spawn: spawn2 } = await import("child_process");
4522
- return await new Promise((resolve, reject) => {
4523
- const child = spawn2(scriptPath, {
4524
- shell: true,
4525
- cwd
4526
- });
4527
- let stdout = "";
4528
- let stderr = "";
4529
- const timeout = agentTimeoutMs ? setTimeout(() => {
4530
- child.kill();
4531
- reject(new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
4532
- }, agentTimeoutMs) : void 0;
4533
- child.stdout?.on("data", (data) => {
4534
- stdout += data.toString();
4535
- });
4536
- child.stderr?.on("data", (data) => {
4537
- stderr += data.toString();
4538
- });
4539
- child.on("error", (error) => {
4540
- if (timeout !== void 0) {
4541
- clearTimeout(timeout);
4542
- }
4543
- reject(error);
4544
- });
4545
- child.on("exit", (code) => {
4546
- if (timeout !== void 0) {
4547
- clearTimeout(timeout);
4548
- }
4549
- if (code && code !== 0 && stderr.length > 0) {
4550
- reject(new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
4551
- return;
4552
- }
4553
- resolve(stdout.trim());
4554
- });
4555
- child.stdin?.write(input);
4556
- child.stdin?.end();
5565
+ const { stdout, stderr, exitCode } = await execShellWithStdin(scriptPath, input, {
5566
+ cwd,
5567
+ timeoutMs: agentTimeoutMs
4557
5568
  });
5569
+ if (exitCode !== 0) {
5570
+ const trimmedErr = stderr.trim();
5571
+ throw new Error(
5572
+ trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
5573
+ );
5574
+ }
5575
+ return stdout.trim();
4558
5576
  }
4559
5577
  function parseJsonSafe(payload) {
4560
5578
  try {
@@ -4568,6 +5586,33 @@ function substituteVariables(template, variables) {
4568
5586
  return variables[varName] ?? match;
4569
5587
  });
4570
5588
  }
5589
+ function deepEqual(a, b) {
5590
+ if (a === b) return true;
5591
+ if (a === null || b === null) return a === b;
5592
+ if (typeof a !== typeof b) return false;
5593
+ if (typeof a !== "object") return a === b;
5594
+ if (Array.isArray(a) !== Array.isArray(b)) return false;
5595
+ if (Array.isArray(a) && Array.isArray(b)) {
5596
+ if (a.length !== b.length) return false;
5597
+ return a.every((val, i) => deepEqual(val, b[i]));
5598
+ }
5599
+ const aObj = a;
5600
+ const bObj = b;
5601
+ const aKeys = Object.keys(aObj);
5602
+ const bKeys = Object.keys(bObj);
5603
+ if (aKeys.length !== bKeys.length) return false;
5604
+ return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
5605
+ }
5606
+ function argsMatch(expected, actual) {
5607
+ if (expected === void 0) return true;
5608
+ if (expected === "any") return true;
5609
+ if (actual === void 0) return false;
5610
+ for (const key of Object.keys(expected)) {
5611
+ if (!Object.hasOwn(actual, key)) return false;
5612
+ if (!deepEqual(expected[key], actual[key])) return false;
5613
+ }
5614
+ return true;
5615
+ }
4571
5616
  var ToolTrajectoryEvaluator = class {
4572
5617
  kind = "tool_trajectory";
4573
5618
  config;
@@ -4624,7 +5669,10 @@ var ToolTrajectoryEvaluator = class {
4624
5669
  for (const message of messages) {
4625
5670
  if (message.toolCalls) {
4626
5671
  for (const call of message.toolCalls) {
4627
- toolCalls.push({ name: call.tool });
5672
+ toolCalls.push({
5673
+ name: call.tool,
5674
+ args: call.input
5675
+ });
4628
5676
  }
4629
5677
  }
4630
5678
  }
@@ -4693,18 +5741,29 @@ var ToolTrajectoryEvaluator = class {
4693
5741
  const misses = [];
4694
5742
  let actualIndex = 0;
4695
5743
  for (let i = 0; i < expected.length; i++) {
4696
- const expectedTool = expected[i].tool;
5744
+ const expectedItem = expected[i];
5745
+ const expectedTool = expectedItem.tool;
4697
5746
  let found = false;
5747
+ let argsMismatch = false;
4698
5748
  while (actualIndex < toolCalls.length) {
4699
- if (toolCalls[actualIndex].name === expectedTool) {
4700
- hits.push(`Found ${expectedTool} at position ${actualIndex}`);
5749
+ const actualCall = toolCalls[actualIndex];
5750
+ if (actualCall.name === expectedTool) {
5751
+ if (argsMatch(expectedItem.args, actualCall.args)) {
5752
+ hits.push(`Found ${expectedTool} at position ${actualIndex}`);
5753
+ actualIndex++;
5754
+ found = true;
5755
+ break;
5756
+ }
5757
+ misses.push(
5758
+ `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
5759
+ );
4701
5760
  actualIndex++;
4702
- found = true;
5761
+ argsMismatch = true;
4703
5762
  break;
4704
5763
  }
4705
5764
  actualIndex++;
4706
5765
  }
4707
- if (!found) {
5766
+ if (!found && !argsMismatch) {
4708
5767
  misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
4709
5768
  }
4710
5769
  }
@@ -4735,10 +5794,16 @@ var ToolTrajectoryEvaluator = class {
4735
5794
  }
4736
5795
  const checkLength = Math.min(expected.length, toolCalls.length);
4737
5796
  for (let i = 0; i < checkLength; i++) {
4738
- const expectedTool = expected[i].tool;
4739
- const actualTool = toolCalls[i].name;
5797
+ const expectedItem = expected[i];
5798
+ const expectedTool = expectedItem.tool;
5799
+ const actualCall = toolCalls[i];
5800
+ const actualTool = actualCall.name;
4740
5801
  if (actualTool === expectedTool) {
4741
- hits.push(`Position ${i}: ${expectedTool} \u2713`);
5802
+ if (argsMatch(expectedItem.args, actualCall.args)) {
5803
+ hits.push(`Position ${i}: ${expectedTool}`);
5804
+ } else {
5805
+ misses.push(`Position ${i}: ${expectedTool} args mismatch`);
5806
+ }
4742
5807
  } else {
4743
5808
  misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
4744
5809
  }
@@ -4982,9 +6047,9 @@ var CompositeEvaluator = class {
4982
6047
  };
4983
6048
 
4984
6049
  // src/evaluation/orchestrator.ts
4985
- var import_node_crypto2 = require("crypto");
4986
- var import_promises11 = require("fs/promises");
4987
- var import_node_path14 = __toESM(require("path"), 1);
6050
+ var import_node_crypto3 = require("crypto");
6051
+ var import_promises12 = require("fs/promises");
6052
+ var import_node_path15 = __toESM(require("path"), 1);
4988
6053
 
4989
6054
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
4990
6055
  var Node = class {
@@ -5380,7 +6445,12 @@ async function runBatchEvaluation(options) {
5380
6445
  const promptInputs = promptInputsList[i];
5381
6446
  const providerResponse = batchResponse[i];
5382
6447
  const outputMessages = providerResponse.outputMessages;
5383
- const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
6448
+ const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
6449
+ const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
6450
+ tokenUsage: providerResponse.tokenUsage,
6451
+ costUsd: providerResponse.costUsd,
6452
+ durationMs: providerResponse.durationMs
6453
+ }) : void 0;
5384
6454
  const candidate = extractLastAssistantContent(outputMessages);
5385
6455
  let result;
5386
6456
  try {
@@ -5501,7 +6571,12 @@ async function runEvalCase(options) {
5501
6571
  await cache.set(cacheKey, providerResponse);
5502
6572
  }
5503
6573
  const outputMessages = providerResponse.outputMessages;
5504
- const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
6574
+ const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
6575
+ const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
6576
+ tokenUsage: providerResponse.tokenUsage,
6577
+ costUsd: providerResponse.costUsd,
6578
+ durationMs: providerResponse.durationMs
6579
+ }) : void 0;
5505
6580
  const candidate = extractLastAssistantContent(outputMessages);
5506
6581
  try {
5507
6582
  return await evaluateCandidate({
@@ -5574,21 +6649,21 @@ async function evaluateCandidate(options) {
5574
6649
  }
5575
6650
  return {
5576
6651
  timestamp: completedAt.toISOString(),
5577
- eval_id: evalCase.id,
6652
+ evalId: evalCase.id,
5578
6653
  dataset: evalCase.dataset,
5579
- conversation_id: evalCase.conversation_id,
6654
+ conversationId: evalCase.conversation_id,
5580
6655
  score: score.score,
5581
6656
  hits: score.hits,
5582
6657
  misses: score.misses,
5583
- candidate_answer: candidate,
6658
+ candidateAnswer: candidate,
5584
6659
  target: target.name,
5585
6660
  reasoning: score.reasoning,
5586
- raw_aspects: score.rawAspects,
5587
- agent_provider_request: agentProviderRequest,
5588
- lm_provider_request: lmProviderRequest,
5589
- evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
5590
- evaluator_results: evaluatorResults,
5591
- trace_summary: traceSummary
6661
+ rawAspects: score.rawAspects,
6662
+ agentProviderRequest,
6663
+ lmProviderRequest,
6664
+ evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
6665
+ evaluatorResults,
6666
+ traceSummary
5592
6667
  };
5593
6668
  }
5594
6669
  async function runEvaluatorsForCase(options) {
@@ -5686,7 +6761,7 @@ async function runEvaluatorList(options) {
5686
6761
  hits: score2.hits,
5687
6762
  misses: score2.misses,
5688
6763
  reasoning: score2.reasoning,
5689
- evaluator_provider_request: score2.evaluatorRawRequest
6764
+ evaluatorProviderRequest: score2.evaluatorRawRequest
5690
6765
  });
5691
6766
  }
5692
6767
  if (evaluator.type === "code") {
@@ -5717,11 +6792,11 @@ async function runEvaluatorList(options) {
5717
6792
  hits: score2.hits,
5718
6793
  misses: score2.misses,
5719
6794
  reasoning: score2.reasoning,
5720
- evaluator_provider_request: score2.evaluatorRawRequest
6795
+ evaluatorProviderRequest: score2.evaluatorRawRequest
5721
6796
  });
5722
6797
  }
5723
6798
  if (evaluator.type === "composite") {
5724
- const evalFileDir = evalCase.guideline_paths[0] ? import_node_path14.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
6799
+ const evalFileDir = evalCase.guideline_paths[0] ? import_node_path15.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
5725
6800
  const createEvaluator = (memberConfig) => {
5726
6801
  switch (memberConfig.type) {
5727
6802
  case "llm_judge":
@@ -5774,8 +6849,8 @@ async function runEvaluatorList(options) {
5774
6849
  hits: score2.hits,
5775
6850
  misses: score2.misses,
5776
6851
  reasoning: score2.reasoning,
5777
- evaluator_provider_request: score2.evaluatorRawRequest,
5778
- evaluator_results: mapChildResults(score2.evaluatorResults)
6852
+ evaluatorProviderRequest: score2.evaluatorRawRequest,
6853
+ evaluatorResults: mapChildResults(score2.evaluatorResults)
5779
6854
  });
5780
6855
  }
5781
6856
  if (evaluator.type === "tool_trajectory") {
@@ -5933,22 +7008,22 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
5933
7008
  async function dumpPrompt(directory, evalCase, promptInputs) {
5934
7009
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
5935
7010
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
5936
- const filePath = import_node_path14.default.resolve(directory, filename);
5937
- await (0, import_promises11.mkdir)(import_node_path14.default.dirname(filePath), { recursive: true });
7011
+ const filePath = import_node_path15.default.resolve(directory, filename);
7012
+ await (0, import_promises12.mkdir)(import_node_path15.default.dirname(filePath), { recursive: true });
5938
7013
  const payload = {
5939
7014
  eval_id: evalCase.id,
5940
7015
  question: promptInputs.question,
5941
7016
  guidelines: promptInputs.guidelines,
5942
7017
  guideline_paths: evalCase.guideline_paths
5943
7018
  };
5944
- await (0, import_promises11.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
7019
+ await (0, import_promises12.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
5945
7020
  }
5946
7021
  function sanitizeFilename(value) {
5947
7022
  if (!value) {
5948
7023
  return "prompt";
5949
7024
  }
5950
7025
  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
5951
- return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
7026
+ return sanitized.length > 0 ? sanitized : (0, import_node_crypto3.randomUUID)();
5952
7027
  }
5953
7028
  async function invokeProvider(provider, options) {
5954
7029
  const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
@@ -6005,22 +7080,22 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
6005
7080
  }
6006
7081
  return {
6007
7082
  timestamp: timestamp.toISOString(),
6008
- eval_id: evalCase.id,
7083
+ evalId: evalCase.id,
6009
7084
  dataset: evalCase.dataset,
6010
- conversation_id: evalCase.conversation_id,
7085
+ conversationId: evalCase.conversation_id,
6011
7086
  score: 0,
6012
7087
  hits: [],
6013
7088
  misses: [`Error: ${message}`],
6014
- candidate_answer: `Error occurred: ${message}`,
7089
+ candidateAnswer: `Error occurred: ${message}`,
6015
7090
  target: targetName,
6016
- raw_aspects: [],
6017
- agent_provider_request: agentProviderRequest,
6018
- lm_provider_request: lmProviderRequest,
7091
+ rawAspects: [],
7092
+ agentProviderRequest,
7093
+ lmProviderRequest,
6019
7094
  error: message
6020
7095
  };
6021
7096
  }
6022
7097
  function createCacheKey(provider, target, evalCase, promptInputs) {
6023
- const hash = (0, import_node_crypto2.createHash)("sha256");
7098
+ const hash = (0, import_node_crypto3.createHash)("sha256");
6024
7099
  hash.update(provider.id);
6025
7100
  hash.update(target.name);
6026
7101
  hash.update(evalCase.id);
@@ -6060,8 +7135,8 @@ function mapChildResults(children) {
6060
7135
  hits: child.hits,
6061
7136
  misses: child.misses,
6062
7137
  reasoning: child.reasoning,
6063
- evaluator_provider_request: child.evaluatorRawRequest,
6064
- evaluator_results: mapChildResults(child.evaluatorResults)
7138
+ evaluatorProviderRequest: child.evaluatorRawRequest,
7139
+ evaluatorResults: mapChildResults(child.evaluatorResults)
6065
7140
  }));
6066
7141
  }
6067
7142
  function computeWeightedMean(entries) {
@@ -6163,17 +7238,21 @@ function createAgentKernel() {
6163
7238
  0 && (module.exports = {
6164
7239
  CodeEvaluator,
6165
7240
  CompositeEvaluator,
7241
+ DEFAULT_EXPLORATION_TOOLS,
6166
7242
  LlmJudgeEvaluator,
6167
7243
  TEST_MESSAGE_ROLES,
6168
7244
  ToolTrajectoryEvaluator,
7245
+ avgToolDurationMs,
6169
7246
  buildDirectoryChain,
6170
7247
  buildPromptInputs,
6171
7248
  buildSearchRoots,
6172
7249
  computeTraceSummary,
6173
7250
  consumeCodexLogEntries,
7251
+ consumePiLogEntries,
6174
7252
  createAgentKernel,
6175
7253
  createProvider,
6176
7254
  ensureVSCodeSubagents,
7255
+ explorationRatio,
6177
7256
  extractCodeBlocks,
6178
7257
  fileExists,
6179
7258
  findGitRoot,
@@ -6187,6 +7266,7 @@ function createAgentKernel() {
6187
7266
  isTestMessageRole,
6188
7267
  listTargetNames,
6189
7268
  loadEvalCases,
7269
+ mergeExecutionMetrics,
6190
7270
  normalizeLineEndings,
6191
7271
  readJsonFile,
6192
7272
  readTargetDefinitions,
@@ -6197,6 +7277,8 @@ function createAgentKernel() {
6197
7277
  resolveTargetDefinition,
6198
7278
  runEvalCase,
6199
7279
  runEvaluation,
6200
- subscribeToCodexLogEntries
7280
+ subscribeToCodexLogEntries,
7281
+ subscribeToPiLogEntries,
7282
+ tokensPerTool
6201
7283
  });
6202
7284
  //# sourceMappingURL=index.cjs.map