@agentv/core 4.25.1 → 4.25.2-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,14 +17,19 @@ import {
17
17
  readTextFile,
18
18
  resolveDelegatedTargetDefinition,
19
19
  resolveTargetDefinition
20
- } from "./chunk-6HLBKYE2.js";
20
+ } from "./chunk-CALQDF2Y.js";
21
21
  import {
22
22
  execFileWithStdin,
23
23
  execShellWithStdin
24
24
  } from "./chunk-3WGHC7LC.js";
25
25
  import {
26
- AgentvProvider
27
- } from "./chunk-PRNXHNLF.js";
26
+ AgentvProvider,
27
+ AnthropicProvider,
28
+ AzureProvider,
29
+ GeminiProvider,
30
+ OpenAIProvider,
31
+ OpenRouterProvider
32
+ } from "./chunk-5XV3FAAD.js";
28
33
 
29
34
  // src/evaluation/loaders/ts-eval-loader.ts
30
35
  import path46 from "node:path";
@@ -730,6 +735,8 @@ var CodeGrader = class {
730
735
  const env = proxyEnv || workspaceEnv ? { ...proxyEnv, ...workspaceEnv } : void 0;
731
736
  try {
732
737
  let stdout;
738
+ let exitCode = 0;
739
+ let execStderr = "";
733
740
  if (context.dockerConfig) {
734
741
  const { DockerWorkspaceProvider } = await import("./docker-workspace-RPPXBT27.js");
735
742
  const dockerProvider = new DockerWorkspaceProvider(context.dockerConfig);
@@ -738,31 +745,40 @@ var CodeGrader = class {
738
745
  stdin: inputPayload,
739
746
  repoCheckouts: getRepoCheckoutTargets(context.evalCase.workspace?.repos)
740
747
  });
741
- if (result.exitCode !== 0) {
742
- const trimmedErr = result.stderr.trim();
743
- throw new Error(
744
- trimmedErr.length > 0 ? `Code evaluator exited with code ${result.exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${result.exitCode}`
745
- );
746
- }
748
+ exitCode = result.exitCode;
747
749
  stdout = result.stdout.trim();
750
+ execStderr = result.stderr;
748
751
  } else {
749
- stdout = await executeScript(
752
+ const result = await runScriptRaw(
750
753
  this.command,
751
754
  inputPayload,
752
755
  this.agentTimeoutMs,
753
756
  this.cwd,
754
757
  env
755
758
  );
759
+ exitCode = result.exitCode;
760
+ stdout = result.stdout.trim();
761
+ execStderr = result.stderr;
756
762
  }
757
- const parsed = parseJsonSafe(stdout);
758
- const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
759
- const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
763
+ const looksLikeJson = stdout.startsWith("{") || stdout.startsWith("[");
764
+ const hasStderr = execStderr.trim().length > 0;
765
+ if (exitCode !== 0 && (looksLikeJson || hasStderr)) {
766
+ const trimmedErr = formatStderr(execStderr);
767
+ throw new Error(
768
+ trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
769
+ );
770
+ }
771
+ const rawParsed = parseJsonSafe(stdout);
772
+ const parsed = rawParsed != null && typeof rawParsed === "object" && !Array.isArray(rawParsed) ? rawParsed : void 0;
773
+ const passed = exitCode === 0;
774
+ const score = parsed != null ? clampScore(typeof parsed.score === "number" ? parsed.score : 0) : passed ? 1 : 0;
775
+ const assertions = parsed != null && Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
760
776
  (a) => typeof a === "object" && a !== null && typeof a.text === "string"
761
777
  ).map((a) => ({
762
778
  text: String(a.text),
763
779
  passed: Boolean(a.passed),
764
780
  ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
765
- })) : [];
781
+ })) : parsed == null ? [{ text: stdout.trim() || (passed ? "exit 0" : `exit ${exitCode}`), passed }] : [];
766
782
  const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
767
783
  const proxyUsage = getProxyUsage?.();
768
784
  const graderRawRequest = {
@@ -820,8 +836,17 @@ var CodeGrader = class {
820
836
  }
821
837
  }
822
838
  };
839
+ async function runScriptRaw(scriptPath, input, agentTimeoutMs, cwd, env) {
840
+ return typeof scriptPath === "string" ? execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
841
+ }
823
842
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
824
- const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
843
+ const { stdout, stderr, exitCode } = await runScriptRaw(
844
+ scriptPath,
845
+ input,
846
+ agentTimeoutMs,
847
+ cwd,
848
+ env
849
+ );
825
850
  if (exitCode !== 0) {
826
851
  const trimmedErr = formatStderr(stderr);
827
852
  throw new Error(
@@ -841,13 +866,9 @@ function formatStderr(stderr) {
841
866
  ${tail}`;
842
867
  }
843
868
 
844
- // src/evaluation/graders/composite.ts
845
- import { generateText as generateText2 } from "ai";
846
-
847
869
  // src/evaluation/graders/llm-grader.ts
848
870
  import fs from "node:fs/promises";
849
871
  import path3 from "node:path";
850
- import { generateText, stepCountIs, tool } from "ai";
851
872
  import { z } from "zod";
852
873
 
853
874
  // src/evaluation/content-preprocessor.ts
@@ -1357,18 +1378,15 @@ ${context.toolCalls}`;
1357
1378
  }
1358
1379
  }
1359
1380
  // ---------------------------------------------------------------------------
1360
- // Built-in agent mode (agentv provider — AI SDK generateText with filesystem tools)
1381
+ // Built-in agent mode (agentv provider — provider.invoke() with filesystem tools)
1361
1382
  // ---------------------------------------------------------------------------
1362
1383
  /**
1363
- * Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools.
1384
+ * Built-in mode: drives the grader through provider.invoke() with the
1385
+ * sandboxed filesystem tools and a step budget. The pi-ai-backed agentv
1386
+ * provider runs the agent loop (tool call → tool execute → next model
1387
+ * turn) until the model stops requesting tools or maxSteps is hit.
1364
1388
  */
1365
1389
  async evaluateBuiltIn(context, graderProvider) {
1366
- const model = graderProvider.asLanguageModel?.();
1367
- if (!model) {
1368
- throw new Error(
1369
- `Grader provider '${graderProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent mode`
1370
- );
1371
- }
1372
1390
  const workspacePath = context.workspacePath;
1373
1391
  if (!workspacePath) {
1374
1392
  throw new Error(
@@ -1387,18 +1405,21 @@ ${context.toolCalls}`;
1387
1405
  maxSteps: this.maxSteps
1388
1406
  };
1389
1407
  try {
1390
- const { text, steps } = await generateText({
1391
- model,
1392
- system: systemPrompt,
1393
- prompt: userPrompt,
1408
+ const response = await graderProvider.invoke({
1409
+ question: userPrompt,
1410
+ systemPrompt,
1411
+ evalCaseId: context.evalCase.id,
1412
+ attempt: context.attempt,
1413
+ temperature: this.temperature ?? 0,
1394
1414
  tools: fsTools,
1395
- stopWhen: stepCountIs(this.maxSteps),
1396
- temperature: this.temperature ?? 0
1415
+ maxSteps: this.maxSteps
1397
1416
  });
1398
- const toolCallCount = steps.reduce((count, step) => count + (step.toolCalls?.length ?? 0), 0);
1417
+ const text = extractLastAssistantContent(response.output);
1418
+ const stepCount = response.steps?.count ?? 1;
1419
+ const toolCallCount = response.steps?.toolCallCount ?? 0;
1399
1420
  const details = {
1400
1421
  mode: "built-in",
1401
- steps: steps.length,
1422
+ steps: stepCount,
1402
1423
  tool_calls: toolCallCount
1403
1424
  };
1404
1425
  return this.parseAgentResult(
@@ -1850,43 +1871,14 @@ ${outputSchema}`;
1850
1871
  }
1851
1872
  async generateStructuredResponse(options) {
1852
1873
  const { context, graderProvider, systemPrompt, userPrompt, images } = options;
1853
- const model = graderProvider.asLanguageModel?.();
1854
- if (model) {
1855
- const modelOptions = {
1856
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
1857
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
1858
- };
1859
- const hasImages = images && images.length > 0;
1860
- const result = hasImages ? await generateText({
1861
- model,
1862
- system: systemPrompt,
1863
- messages: [
1864
- {
1865
- role: "user",
1866
- content: [
1867
- { type: "text", text: userPrompt },
1868
- ...toAiSdkImageParts(images)
1869
- ]
1870
- }
1871
- ],
1872
- ...modelOptions
1873
- }) : await generateText({
1874
- model,
1875
- system: systemPrompt,
1876
- prompt: userPrompt,
1877
- ...modelOptions
1878
- });
1879
- const rawUsage = result.usage;
1880
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
1881
- return { text: result.text, tokenUsage };
1882
- }
1883
1874
  const response = await graderProvider.invoke({
1884
1875
  question: userPrompt,
1885
1876
  systemPrompt,
1886
1877
  evalCaseId: context.evalCase.id,
1887
1878
  attempt: context.attempt,
1888
1879
  maxOutputTokens: this.maxOutputTokens,
1889
- temperature: this.temperature
1880
+ temperature: this.temperature,
1881
+ ...images && images.length > 0 ? { images } : {}
1890
1882
  });
1891
1883
  return {
1892
1884
  text: extractLastAssistantContent(response.output),
@@ -2083,13 +2075,6 @@ function extractImageBlocks(messages) {
2083
2075
  }
2084
2076
  return images;
2085
2077
  }
2086
- function toAiSdkImageParts(images) {
2087
- return images.map((img) => ({
2088
- type: "image",
2089
- image: img.source,
2090
- mediaType: img.media_type || void 0
2091
- }));
2092
- }
2093
2078
  function resolveSandboxed(basePath, relativePath) {
2094
2079
  const resolved = path3.resolve(basePath, relativePath);
2095
2080
  if (!resolved.startsWith(basePath + path3.sep) && resolved !== basePath) {
@@ -2098,15 +2083,24 @@ function resolveSandboxed(basePath, relativePath) {
2098
2083
  return resolved;
2099
2084
  }
2100
2085
  function createFilesystemTools(workspacePath) {
2101
- return {
2102
- list_files: tool({
2086
+ return [
2087
+ {
2088
+ name: "list_files",
2103
2089
  description: "List files and directories at a relative path within the workspace. Returns names only (single level, no recursion).",
2104
- inputSchema: z.object({
2105
- path: z.string().describe('Relative path within workspace (use "." for root)').default(".")
2106
- }),
2090
+ parameters: {
2091
+ type: "object",
2092
+ properties: {
2093
+ path: {
2094
+ type: "string",
2095
+ description: 'Relative path within workspace (use "." for root)',
2096
+ default: "."
2097
+ }
2098
+ }
2099
+ },
2107
2100
  execute: async (input) => {
2101
+ const args = input ?? {};
2108
2102
  try {
2109
- const resolved = resolveSandboxed(workspacePath, input.path);
2103
+ const resolved = resolveSandboxed(workspacePath, args.path ?? ".");
2110
2104
  const entries = await fs.readdir(resolved, { withFileTypes: true });
2111
2105
  return entries.map((e) => ({
2112
2106
  name: e.name,
@@ -2116,18 +2110,25 @@ function createFilesystemTools(workspacePath) {
2116
2110
  return { error: error instanceof Error ? error.message : String(error) };
2117
2111
  }
2118
2112
  }
2119
- }),
2120
- read_file: tool({
2113
+ },
2114
+ {
2115
+ name: "read_file",
2121
2116
  description: "Read the content of a file at a relative path within the workspace. Large files are truncated at 50KB.",
2122
- inputSchema: z.object({
2123
- path: z.string().describe("Relative path to file within workspace")
2124
- }),
2117
+ parameters: {
2118
+ type: "object",
2119
+ properties: {
2120
+ path: { type: "string", description: "Relative path to file within workspace" }
2121
+ },
2122
+ required: ["path"]
2123
+ },
2125
2124
  execute: async (input) => {
2125
+ const args = input ?? {};
2126
+ const relPath = args.path ?? "";
2126
2127
  try {
2127
- const resolved = resolveSandboxed(workspacePath, input.path);
2128
+ const resolved = resolveSandboxed(workspacePath, relPath);
2128
2129
  const stat10 = await fs.stat(resolved);
2129
2130
  if (stat10.isDirectory()) {
2130
- return { error: `'${input.path}' is a directory, not a file` };
2131
+ return { error: `'${relPath}' is a directory, not a file` };
2131
2132
  }
2132
2133
  const buffer = Buffer.alloc(Math.min(stat10.size, MAX_FILE_SIZE));
2133
2134
  const fd = await fs.open(resolved, "r");
@@ -2143,19 +2144,29 @@ function createFilesystemTools(workspacePath) {
2143
2144
  return { error: error instanceof Error ? error.message : String(error) };
2144
2145
  }
2145
2146
  }
2146
- }),
2147
- search_files: tool({
2147
+ },
2148
+ {
2149
+ name: "search_files",
2148
2150
  description: "Search for a regex pattern across files in the workspace. Returns up to 20 matches. Skips binary files and node_modules/.git.",
2149
- inputSchema: z.object({
2150
- pattern: z.string().describe("Regex pattern to search for"),
2151
- path: z.string().describe('Relative path to search within (use "." for root)').default(".")
2152
- }),
2151
+ parameters: {
2152
+ type: "object",
2153
+ properties: {
2154
+ pattern: { type: "string", description: "Regex pattern to search for" },
2155
+ path: {
2156
+ type: "string",
2157
+ description: 'Relative path to search within (use "." for root)',
2158
+ default: "."
2159
+ }
2160
+ },
2161
+ required: ["pattern"]
2162
+ },
2153
2163
  execute: async (input) => {
2164
+ const args = input ?? {};
2154
2165
  try {
2155
- const resolved = resolveSandboxed(workspacePath, input.path);
2166
+ const resolved = resolveSandboxed(workspacePath, args.path ?? ".");
2156
2167
  let regex;
2157
2168
  try {
2158
- regex = new RegExp(input.pattern, "gi");
2169
+ regex = new RegExp(args.pattern ?? "", "gi");
2159
2170
  } catch (regexErr) {
2160
2171
  return {
2161
2172
  error: `Invalid regex pattern: ${regexErr instanceof Error ? regexErr.message : String(regexErr)}`
@@ -2168,8 +2179,8 @@ function createFilesystemTools(workspacePath) {
2168
2179
  return { error: error instanceof Error ? error.message : String(error) };
2169
2180
  }
2170
2181
  }
2171
- })
2172
- };
2182
+ }
2183
+ ];
2173
2184
  }
2174
2185
  async function searchDirectory(dirPath, workspacePath, regex, matches) {
2175
2186
  if (matches.length >= MAX_SEARCH_MATCHES) return;
@@ -2449,25 +2460,6 @@ var CompositeGrader = class {
2449
2460
  target: graderProvider.targetName
2450
2461
  };
2451
2462
  try {
2452
- const model = graderProvider.asLanguageModel?.();
2453
- if (model) {
2454
- const { text } = await generateText2({
2455
- model,
2456
- system: systemPrompt,
2457
- prompt: userPrompt
2458
- });
2459
- const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
2460
- const score2 = clampScore(data2.score);
2461
- const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
2462
- return {
2463
- score: score2,
2464
- verdict: scoreToVerdict(score2),
2465
- assertions: assertions2,
2466
- expectedAspectCount: Math.max(assertions2.length, 1),
2467
- graderRawRequest,
2468
- scores
2469
- };
2470
- }
2471
2463
  const response = await graderProvider.invoke({
2472
2464
  question: userPrompt,
2473
2465
  systemPrompt,
@@ -2625,7 +2617,7 @@ var DEFAULT_EXPLORATION_TOOLS = [
2625
2617
  function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
2626
2618
  if (summary.eventCount === 0) return void 0;
2627
2619
  const explorationCalls = explorationTools.reduce(
2628
- (sum, tool2) => sum + (summary.toolCalls[tool2] ?? 0),
2620
+ (sum, tool) => sum + (summary.toolCalls[tool] ?? 0),
2629
2621
  0
2630
2622
  );
2631
2623
  return explorationCalls / summary.eventCount;
@@ -4261,422 +4253,6 @@ function runEqualsAssertion(output, value) {
4261
4253
  };
4262
4254
  }
4263
4255
 
4264
- // src/evaluation/providers/ai-sdk.ts
4265
- import { createAnthropic } from "@ai-sdk/anthropic";
4266
- import { createAzure } from "@ai-sdk/azure";
4267
- import { createGoogleGenerativeAI } from "@ai-sdk/google";
4268
- import { createOpenAI } from "@ai-sdk/openai";
4269
- import { createOpenRouter } from "@openrouter/ai-sdk-provider";
4270
- import { generateText as generateText3 } from "ai";
4271
- var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
4272
- var OpenAIProvider = class {
4273
- constructor(targetName, config) {
4274
- this.config = config;
4275
- this.id = `openai:${targetName}`;
4276
- this.targetName = targetName;
4277
- this.defaults = {
4278
- temperature: config.temperature,
4279
- maxOutputTokens: config.maxOutputTokens
4280
- };
4281
- this.retryConfig = config.retry;
4282
- const openai = createOpenAI({
4283
- apiKey: config.apiKey,
4284
- baseURL: config.baseURL
4285
- });
4286
- this.model = config.apiFormat === "responses" ? openai(config.model) : openai.chat(config.model);
4287
- }
4288
- id;
4289
- kind = "openai";
4290
- targetName;
4291
- model;
4292
- defaults;
4293
- retryConfig;
4294
- async invoke(request) {
4295
- return invokeModel({
4296
- model: this.model,
4297
- request,
4298
- defaults: this.defaults,
4299
- retryConfig: this.retryConfig
4300
- });
4301
- }
4302
- asLanguageModel() {
4303
- return this.model;
4304
- }
4305
- };
4306
- var AzureProvider = class {
4307
- constructor(targetName, config) {
4308
- this.config = config;
4309
- this.id = `azure:${targetName}`;
4310
- this.targetName = targetName;
4311
- this.defaults = {
4312
- temperature: config.temperature,
4313
- maxOutputTokens: config.maxOutputTokens
4314
- };
4315
- this.retryConfig = config.retry;
4316
- const azure = createAzure(buildAzureOptions(config));
4317
- this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
4318
- }
4319
- id;
4320
- kind = "azure";
4321
- targetName;
4322
- model;
4323
- defaults;
4324
- retryConfig;
4325
- async invoke(request) {
4326
- return invokeModel({
4327
- model: this.model,
4328
- request,
4329
- defaults: this.defaults,
4330
- retryConfig: this.retryConfig
4331
- });
4332
- }
4333
- asLanguageModel() {
4334
- return this.model;
4335
- }
4336
- };
4337
- var OpenRouterProvider = class {
4338
- constructor(targetName, config) {
4339
- this.config = config;
4340
- this.id = `openrouter:${targetName}`;
4341
- this.targetName = targetName;
4342
- this.defaults = {
4343
- temperature: config.temperature,
4344
- maxOutputTokens: config.maxOutputTokens
4345
- };
4346
- this.retryConfig = config.retry;
4347
- const openrouter = createOpenRouter({
4348
- apiKey: config.apiKey
4349
- });
4350
- this.model = openrouter(config.model);
4351
- }
4352
- id;
4353
- kind = "openrouter";
4354
- targetName;
4355
- model;
4356
- defaults;
4357
- retryConfig;
4358
- async invoke(request) {
4359
- return invokeModel({
4360
- model: this.model,
4361
- request,
4362
- defaults: this.defaults,
4363
- retryConfig: this.retryConfig
4364
- });
4365
- }
4366
- asLanguageModel() {
4367
- return this.model;
4368
- }
4369
- };
4370
- var AnthropicProvider = class {
4371
- constructor(targetName, config) {
4372
- this.config = config;
4373
- this.id = `anthropic:${targetName}`;
4374
- this.targetName = targetName;
4375
- this.defaults = {
4376
- temperature: config.temperature,
4377
- maxOutputTokens: config.maxOutputTokens,
4378
- thinkingBudget: config.thinkingBudget
4379
- };
4380
- this.retryConfig = config.retry;
4381
- const anthropic = createAnthropic({
4382
- apiKey: config.apiKey
4383
- });
4384
- this.model = anthropic(config.model);
4385
- }
4386
- id;
4387
- kind = "anthropic";
4388
- targetName;
4389
- model;
4390
- defaults;
4391
- retryConfig;
4392
- async invoke(request) {
4393
- const providerOptions = buildAnthropicProviderOptions(this.defaults);
4394
- return invokeModel({
4395
- model: this.model,
4396
- request,
4397
- defaults: this.defaults,
4398
- retryConfig: this.retryConfig,
4399
- providerOptions
4400
- });
4401
- }
4402
- asLanguageModel() {
4403
- return this.model;
4404
- }
4405
- };
4406
- var GeminiProvider = class {
4407
- constructor(targetName, config) {
4408
- this.config = config;
4409
- this.id = `gemini:${targetName}`;
4410
- this.targetName = targetName;
4411
- this.defaults = {
4412
- temperature: config.temperature,
4413
- maxOutputTokens: config.maxOutputTokens
4414
- };
4415
- this.retryConfig = config.retry;
4416
- const google = createGoogleGenerativeAI({
4417
- apiKey: config.apiKey
4418
- });
4419
- this.model = google(config.model);
4420
- }
4421
- id;
4422
- kind = "gemini";
4423
- targetName;
4424
- model;
4425
- defaults;
4426
- retryConfig;
4427
- async invoke(request) {
4428
- return invokeModel({
4429
- model: this.model,
4430
- request,
4431
- defaults: this.defaults,
4432
- retryConfig: this.retryConfig
4433
- });
4434
- }
4435
- asLanguageModel() {
4436
- return this.model;
4437
- }
4438
- };
4439
- function buildAzureOptions(config) {
4440
- const options = {
4441
- apiKey: config.apiKey,
4442
- apiVersion: config.version,
4443
- // Chat completions still use deployment-scoped Azure URLs for compatibility
4444
- // with existing deployments. Responses API should use the SDK's v1 path.
4445
- useDeploymentBasedUrls: config.apiFormat !== "responses"
4446
- };
4447
- const baseURL = normalizeAzureBaseUrl(config.resourceName);
4448
- if (baseURL) {
4449
- options.baseURL = baseURL;
4450
- } else {
4451
- options.resourceName = config.resourceName;
4452
- }
4453
- return options;
4454
- }
4455
- function normalizeAzureBaseUrl(resourceName) {
4456
- const trimmed = resourceName.trim();
4457
- if (!/^https?:\/\//i.test(trimmed)) {
4458
- return void 0;
4459
- }
4460
- const withoutSlash = trimmed.replace(/\/+$/, "");
4461
- const normalized = withoutSlash.endsWith("/openai") ? withoutSlash : `${withoutSlash}/openai`;
4462
- return normalized;
4463
- }
4464
- function buildAnthropicProviderOptions(defaults) {
4465
- if (defaults.thinkingBudget === void 0) {
4466
- return void 0;
4467
- }
4468
- return {
4469
- anthropic: {
4470
- thinking: {
4471
- type: "enabled",
4472
- budgetTokens: defaults.thinkingBudget
4473
- }
4474
- }
4475
- };
4476
- }
4477
- function buildChatPrompt(request) {
4478
- const provided = request.chatPrompt?.length ? request.chatPrompt : void 0;
4479
- if (provided) {
4480
- const hasSystemMessage = provided.some((message) => message.role === "system");
4481
- if (hasSystemMessage) {
4482
- return provided;
4483
- }
4484
- const systemContent2 = resolveSystemContent(request);
4485
- return [{ role: "system", content: systemContent2 }, ...provided];
4486
- }
4487
- const systemContent = resolveSystemContent(request);
4488
- const userContent = request.question.trim();
4489
- const prompt = [
4490
- { role: "system", content: systemContent },
4491
- { role: "user", content: userContent }
4492
- ];
4493
- return prompt;
4494
- }
4495
- function resolveSystemContent(request) {
4496
- const systemSegments = [];
4497
- if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
4498
- systemSegments.push(request.systemPrompt.trim());
4499
- } else {
4500
- systemSegments.push(DEFAULT_SYSTEM_PROMPT);
4501
- }
4502
- return systemSegments.join("\n\n");
4503
- }
4504
- function toModelMessages(chatPrompt) {
4505
- return chatPrompt.map((message) => {
4506
- if (message.role === "tool" || message.role === "function") {
4507
- const prefix = message.name ? `@[${message.name}]: ` : "@[Tool]: ";
4508
- return {
4509
- role: "assistant",
4510
- content: `${prefix}${message.content}`
4511
- };
4512
- }
4513
- if (message.role === "assistant" || message.role === "system" || message.role === "user") {
4514
- return {
4515
- role: message.role,
4516
- content: message.content
4517
- };
4518
- }
4519
- return {
4520
- role: "user",
4521
- content: message.content
4522
- };
4523
- });
4524
- }
4525
- function resolveModelSettings(request, defaults) {
4526
- const temperature = request.temperature ?? defaults.temperature;
4527
- const maxOutputTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
4528
- return {
4529
- temperature,
4530
- maxOutputTokens
4531
- };
4532
- }
4533
- async function invokeModel(options) {
4534
- const { model, request, defaults, retryConfig, providerOptions } = options;
4535
- const chatPrompt = buildChatPrompt(request);
4536
- const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
4537
- const startTime = (/* @__PURE__ */ new Date()).toISOString();
4538
- const startMs = Date.now();
4539
- const result = await withRetry(
4540
- () => generateText3({
4541
- model,
4542
- messages: toModelMessages(chatPrompt),
4543
- temperature,
4544
- maxOutputTokens,
4545
- maxRetries: 0,
4546
- abortSignal: request.signal,
4547
- ...providerOptions ? { providerOptions } : {}
4548
- }),
4549
- retryConfig,
4550
- request.signal
4551
- );
4552
- const endTime = (/* @__PURE__ */ new Date()).toISOString();
4553
- const durationMs = Date.now() - startMs;
4554
- return mapResponse(result, { durationMs, startTime, endTime });
4555
- }
4556
- function mapResponse(result, timing) {
4557
- const content = result.text ?? "";
4558
- const rawUsage = result.totalUsage ?? result.usage;
4559
- const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
4560
- const cached = rawUsage?.inputTokenDetails?.cacheReadTokens ?? void 0;
4561
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? {
4562
- input: rawUsage.inputTokens,
4563
- output: rawUsage.outputTokens,
4564
- ...reasoning != null ? { reasoning } : {},
4565
- ...cached != null ? { cached } : {}
4566
- } : void 0;
4567
- return {
4568
- raw: result,
4569
- usage: toJsonObject(rawUsage),
4570
- output: [{ role: "assistant", content }],
4571
- tokenUsage,
4572
- durationMs: timing?.durationMs,
4573
- startTime: timing?.startTime,
4574
- endTime: timing?.endTime
4575
- };
4576
- }
4577
- function toJsonObject(value) {
4578
- if (!value || typeof value !== "object") {
4579
- return void 0;
4580
- }
4581
- try {
4582
- return JSON.parse(JSON.stringify(value));
4583
- } catch {
4584
- return void 0;
4585
- }
4586
- }
4587
- function extractStatus(error) {
4588
- if (!error || typeof error !== "object") {
4589
- return void 0;
4590
- }
4591
- const candidate = error;
4592
- const directStatus = candidate.status ?? candidate.statusCode;
4593
- if (typeof directStatus === "number" && Number.isFinite(directStatus)) {
4594
- return directStatus;
4595
- }
4596
- const responseStatus = typeof candidate.response === "object" && candidate.response ? candidate.response.status : void 0;
4597
- if (typeof responseStatus === "number" && Number.isFinite(responseStatus)) {
4598
- return responseStatus;
4599
- }
4600
- const message = typeof candidate.message === "string" ? candidate.message : void 0;
4601
- if (message) {
4602
- const match = message.match(/HTTP\s+(\d{3})/i);
4603
- if (match) {
4604
- const parsed = Number.parseInt(match[1], 10);
4605
- if (Number.isFinite(parsed)) {
4606
- return parsed;
4607
- }
4608
- }
4609
- }
4610
- return void 0;
4611
- }
4612
- function isNetworkError(error) {
4613
- if (!error || typeof error !== "object") {
4614
- return false;
4615
- }
4616
- const candidate = error;
4617
- if (candidate.name === "AbortError") {
4618
- return false;
4619
- }
4620
- const code = candidate.code;
4621
- if (typeof code === "string" && /^E(AI|CONN|HOST|NET|PIPE|TIME|REFUSED|RESET)/i.test(code)) {
4622
- return true;
4623
- }
4624
- const message = typeof candidate.message === "string" ? candidate.message : void 0;
4625
- if (message && /(network|fetch failed|ECONNRESET|ENOTFOUND|EAI_AGAIN|ETIMEDOUT|ECONNREFUSED)/i.test(message)) {
4626
- return true;
4627
- }
4628
- return false;
4629
- }
4630
- function isRetryableError(error, retryableStatusCodes) {
4631
- const status = extractStatus(error);
4632
- if (status === 401 || status === 403) {
4633
- return false;
4634
- }
4635
- if (typeof status === "number") {
4636
- return retryableStatusCodes.includes(status);
4637
- }
4638
- return isNetworkError(error);
4639
- }
4640
- function calculateRetryDelay(attempt, config) {
4641
- const delay = Math.min(
4642
- config.maxDelayMs,
4643
- config.initialDelayMs * config.backoffFactor ** attempt
4644
- );
4645
- return delay * (0.75 + Math.random() * 0.5);
4646
- }
4647
- async function sleep(ms) {
4648
- return new Promise((resolve) => setTimeout(resolve, ms));
4649
- }
4650
- async function withRetry(fn, retryConfig, signal) {
4651
- const config = {
4652
- maxRetries: retryConfig?.maxRetries ?? 3,
4653
- initialDelayMs: retryConfig?.initialDelayMs ?? 1e3,
4654
- maxDelayMs: retryConfig?.maxDelayMs ?? 6e4,
4655
- backoffFactor: retryConfig?.backoffFactor ?? 2,
4656
- retryableStatusCodes: retryConfig?.retryableStatusCodes ?? [500, 408, 429, 502, 503, 504]
4657
- };
4658
- let lastError;
4659
- for (let attempt = 0; attempt <= config.maxRetries; attempt++) {
4660
- if (signal?.aborted) {
4661
- throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
4662
- }
4663
- try {
4664
- return await fn();
4665
- } catch (error) {
4666
- lastError = error;
4667
- if (attempt >= config.maxRetries) {
4668
- break;
4669
- }
4670
- if (!isRetryableError(error, config.retryableStatusCodes)) {
4671
- throw error;
4672
- }
4673
- const delay = calculateRetryDelay(attempt, config);
4674
- await sleep(delay);
4675
- }
4676
- }
4677
- throw lastError;
4678
- }
4679
-
4680
4256
  // src/evaluation/providers/claude-cli.ts
4681
4257
  import { spawn } from "node:child_process";
4682
4258
  import { randomUUID } from "node:crypto";
@@ -9054,10 +8630,10 @@ function extractToolCallsFromEvents(events) {
9054
8630
  }
9055
8631
  }
9056
8632
  const toolCalls = [];
9057
- for (const [id, { tool: tool2, input }] of starts) {
8633
+ for (const [id, { tool, input }] of starts) {
9058
8634
  toolCalls.push(
9059
8635
  normalizeToolCall("pi-cli", {
9060
- tool: tool2,
8636
+ tool,
9061
8637
  input,
9062
8638
  id: id.startsWith("anon-") ? void 0 : id,
9063
8639
  output: results.get(id)
@@ -10124,7 +9700,7 @@ import { readFile as readFile5 } from "node:fs/promises";
10124
9700
  import path20 from "node:path";
10125
9701
 
10126
9702
  // src/evaluation/providers/vscode/utils/time.ts
10127
- function sleep2(ms) {
9703
+ function sleep(ms) {
10128
9704
  return new Promise((resolve) => {
10129
9705
  setTimeout(resolve, ms);
10130
9706
  });
@@ -10147,7 +9723,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
10147
9723
  }
10148
9724
  return false;
10149
9725
  }
10150
- await sleep2(pollInterval);
9726
+ await sleep(pollInterval);
10151
9727
  }
10152
9728
  } catch (error) {
10153
9729
  if (error.code === "ENOENT") {
@@ -10173,7 +9749,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
10173
9749
  }
10174
9750
  return false;
10175
9751
  }
10176
- await sleep2(pollInterval);
9752
+ await sleep(pollInterval);
10177
9753
  }
10178
9754
  }
10179
9755
  return false;
@@ -10202,7 +9778,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
10202
9778
  }
10203
9779
  }
10204
9780
  if (pending.size > 0) {
10205
- await sleep2(pollInterval);
9781
+ await sleep(pollInterval);
10206
9782
  }
10207
9783
  }
10208
9784
  } catch (error) {
@@ -10230,7 +9806,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
10230
9806
  }
10231
9807
  return false;
10232
9808
  }
10233
- await sleep2(pollInterval);
9809
+ await sleep(pollInterval);
10234
9810
  }
10235
9811
  }
10236
9812
  }
@@ -10326,7 +9902,7 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
10326
9902
  label: "open-workspace"
10327
9903
  });
10328
9904
  await raceSpawnError(workspaceChild);
10329
- await sleep2(100);
9905
+ await sleep(100);
10330
9906
  const wakeupChatId = "wakeup";
10331
9907
  const chatArgs = [
10332
9908
  "-r",
@@ -10343,7 +9919,7 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
10343
9919
  console.error(`warning: Workspace readiness timeout after ${timeout}s`);
10344
9920
  return false;
10345
9921
  }
10346
- await sleep2(pollInterval * 1e3);
9922
+ await sleep(pollInterval * 1e3);
10347
9923
  }
10348
9924
  return true;
10349
9925
  }
@@ -10371,7 +9947,7 @@ async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, reques
10371
9947
  `VS Code workspace '${path22.basename(subagentDir)}' failed to become ready within the timeout. Check that '${vscodeCmd}' can open workspaces.`
10372
9948
  );
10373
9949
  }
10374
- await sleep2(500);
9950
+ await sleep(500);
10375
9951
  const child = spawnVsCode(vscodeCmd, chatArgs, { label: "send-chat" });
10376
9952
  await raceSpawnError(child);
10377
9953
  }
@@ -10395,7 +9971,7 @@ async function launchVsCodeWithBatchChat(subagentDir, chatId, attachmentPaths, c
10395
9971
  `VS Code workspace '${path22.basename(subagentDir)}' failed to become ready within the timeout. Check that '${vscodeCmd}' can open workspaces.`
10396
9972
  );
10397
9973
  }
10398
- await sleep2(500);
9974
+ await sleep(500);
10399
9975
  const child = spawnVsCode(vscodeCmd, chatArgs, { label: "send-batch-chat" });
10400
9976
  await raceSpawnError(child);
10401
9977
  }
@@ -16105,7 +15681,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
16105
15681
  return { tests: await loadTestsFromAgentSkills(evalFilePath) };
16106
15682
  }
16107
15683
  if (format === "typescript") {
16108
- const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-4CFPGHGT.js");
15684
+ const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-5JMF2N65.js");
16109
15685
  return loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
16110
15686
  }
16111
15687
  const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
@@ -16140,7 +15716,7 @@ async function loadTests(evalFilePath, repoRoot, options) {
16140
15716
  return loadTestsFromAgentSkills(evalFilePath);
16141
15717
  }
16142
15718
  if (format === "typescript") {
16143
- const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-4CFPGHGT.js");
15719
+ const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-5JMF2N65.js");
16144
15720
  const suite = await loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
16145
15721
  return suite.tests;
16146
15722
  }
@@ -16496,7 +16072,8 @@ function parseWorkspaceConfig(raw, evalFileDir) {
16496
16072
  const workspacePath = typeof obj.path === "string" ? obj.path : void 0;
16497
16073
  const mode = explicitMode ?? (workspacePath ? "static" : void 0);
16498
16074
  const docker = parseDockerWorkspaceConfig(obj.docker);
16499
- if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker)
16075
+ const env = parseWorkspaceEnvConfig(obj.env);
16076
+ if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker && !env)
16500
16077
  return void 0;
16501
16078
  return {
16502
16079
  ...template !== void 0 && { template },
@@ -16505,7 +16082,19 @@ function parseWorkspaceConfig(raw, evalFileDir) {
16505
16082
  ...hooks !== void 0 && { hooks },
16506
16083
  ...mode !== void 0 && { mode },
16507
16084
  ...workspacePath !== void 0 && { path: workspacePath },
16508
- ...docker !== void 0 && { docker }
16085
+ ...docker !== void 0 && { docker },
16086
+ ...env !== void 0 && { env }
16087
+ };
16088
+ }
16089
+ function parseWorkspaceEnvConfig(raw) {
16090
+ if (!isJsonObject(raw)) return void 0;
16091
+ const obj = raw;
16092
+ const required_commands = Array.isArray(obj.required_commands) ? obj.required_commands.filter((c) => typeof c === "string") : void 0;
16093
+ const required_python_modules = Array.isArray(obj.required_python_modules) ? obj.required_python_modules.filter((m) => typeof m === "string") : void 0;
16094
+ if (!required_commands?.length && !required_python_modules?.length) return void 0;
16095
+ return {
16096
+ ...required_commands?.length && { required_commands },
16097
+ ...required_python_modules?.length && { required_python_modules }
16509
16098
  };
16510
16099
  }
16511
16100
  function parseDockerWorkspaceConfig(raw) {
@@ -16865,7 +16454,7 @@ async function runEvaluation(options) {
16865
16454
  if (!cliModel) {
16866
16455
  throw new Error('--grader-target "agentv" requires --model (e.g., "openai:gpt-5-mini")');
16867
16456
  }
16868
- const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-TXM4UEUT.js");
16457
+ const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-MUIGGIP3.js");
16869
16458
  return new AgentvProvider2("agentv", { model: cliModel, temperature: 0 });
16870
16459
  }
16871
16460
  const overrideTarget = resolveTargetByName(cliGraderTarget);
@@ -17196,6 +16785,19 @@ async function runEvaluation(options) {
17196
16785
  await dockerSetup.pullImage();
17197
16786
  setupLog("Docker image pull complete");
17198
16787
  }
16788
+ if (suiteWorkspace?.env) {
16789
+ try {
16790
+ await runPreflightChecks(suiteWorkspace.env, sharedWorkspacePath ?? void 0, setupLog);
16791
+ setupLog("preflight checks passed");
16792
+ } catch (error) {
16793
+ const message = error instanceof Error ? error.message : String(error);
16794
+ if (sharedWorkspacePath && !useStaticWorkspace) {
16795
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
16796
+ });
16797
+ }
16798
+ throw new Error(message);
16799
+ }
16800
+ }
17199
16801
  const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
17200
16802
  const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
17201
16803
  if (sharedWorkspacePath && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) {
@@ -18220,7 +17822,7 @@ async function runEvalCase(options) {
18220
17822
  lastError = error;
18221
17823
  if (attempt + 1 < attemptBudget) {
18222
17824
  const delayMs = retryBackoffMs(attempt);
18223
- await sleep3(delayMs, signal);
17825
+ await sleep2(delayMs, signal);
18224
17826
  attempt += 1;
18225
17827
  continue;
18226
17828
  }
@@ -19425,7 +19027,7 @@ function extractErrorMessage(error) {
19425
19027
  function retryBackoffMs(attempt) {
19426
19028
  return Math.min(2 ** attempt * 1e3, 3e4);
19427
19029
  }
19428
- function sleep3(ms, signal) {
19030
+ function sleep2(ms, signal) {
19429
19031
  if (signal?.aborted) return Promise.resolve();
19430
19032
  return new Promise((resolve) => {
19431
19033
  const timer = setTimeout(resolve, ms);
@@ -19466,6 +19068,38 @@ function computeWeightedMean(entries) {
19466
19068
  }
19467
19069
  return totalWeight > 0 ? weightedSum / totalWeight : 0;
19468
19070
  }
19071
+ async function runPreflightChecks(env, cwd, log) {
19072
+ const execFileAsync4 = promisify7(execFile3);
19073
+ const missing = [];
19074
+ for (const cmd of env.required_commands ?? []) {
19075
+ log(`preflight: checking command "${cmd}"`);
19076
+ try {
19077
+ if (process.platform === "win32") {
19078
+ await execFileAsync4("where", [cmd], { cwd });
19079
+ } else {
19080
+ await execFileAsync4("sh", ["-c", `command -v ${cmd}`], { cwd });
19081
+ }
19082
+ } catch {
19083
+ missing.push(`command: ${cmd}`);
19084
+ }
19085
+ }
19086
+ for (const mod of env.required_python_modules ?? []) {
19087
+ log(`preflight: checking Python module "${mod}"`);
19088
+ try {
19089
+ await execFileAsync4("python3", ["-c", `import ${mod}`], { cwd });
19090
+ } catch {
19091
+ missing.push(`python module: ${mod}`);
19092
+ }
19093
+ }
19094
+ if (missing.length > 0) {
19095
+ throw new Error(
19096
+ `Preflight checks failed \u2014 missing dependencies:
19097
+ ${missing.map((m) => ` \u2022 ${m}`).join("\n")}
19098
+
19099
+ Install the missing dependencies before running this eval.`
19100
+ );
19101
+ }
19102
+ }
19469
19103
 
19470
19104
  // src/evaluation/providers/function-provider.ts
19471
19105
  function createFunctionProvider(taskFn) {
@@ -19954,4 +19588,4 @@ export {
19954
19588
  loadTestById,
19955
19589
  loadEvalCaseById
19956
19590
  };
19957
- //# sourceMappingURL=chunk-IXTJEXWN.js.map
19591
+ //# sourceMappingURL=chunk-F234XBWV.js.map