@agentv/core 4.25.1 → 4.25.3-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,14 +17,19 @@ import {
  readTextFile,
  resolveDelegatedTargetDefinition,
  resolveTargetDefinition
- } from "./chunk-6HLBKYE2.js";
+ } from "./chunk-CALQDF2Y.js";
  import {
  execFileWithStdin,
  execShellWithStdin
  } from "./chunk-3WGHC7LC.js";
  import {
- AgentvProvider
- } from "./chunk-PRNXHNLF.js";
+ AgentvProvider,
+ AnthropicProvider,
+ AzureProvider,
+ GeminiProvider,
+ OpenAIProvider,
+ OpenRouterProvider
+ } from "./chunk-5XV3FAAD.js";

  // src/evaluation/loaders/ts-eval-loader.ts
  import path46 from "node:path";
@@ -730,6 +735,8 @@ var CodeGrader = class {
  const env = proxyEnv || workspaceEnv ? { ...proxyEnv, ...workspaceEnv } : void 0;
  try {
  let stdout;
+ let exitCode = 0;
+ let execStderr = "";
  if (context.dockerConfig) {
  const { DockerWorkspaceProvider } = await import("./docker-workspace-RPPXBT27.js");
  const dockerProvider = new DockerWorkspaceProvider(context.dockerConfig);
@@ -738,31 +745,42 @@ var CodeGrader = class {
  stdin: inputPayload,
  repoCheckouts: getRepoCheckoutTargets(context.evalCase.workspace?.repos)
  });
- if (result.exitCode !== 0) {
- const trimmedErr = result.stderr.trim();
- throw new Error(
- trimmedErr.length > 0 ? `Code evaluator exited with code ${result.exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${result.exitCode}`
- );
- }
+ exitCode = result.exitCode;
  stdout = result.stdout.trim();
+ execStderr = result.stderr;
  } else {
- stdout = await executeScript(
+ const result = await runScriptRaw(
  this.command,
  inputPayload,
  this.agentTimeoutMs,
  this.cwd,
  env
  );
+ exitCode = result.exitCode;
+ stdout = result.stdout.trim();
+ execStderr = result.stderr;
  }
- const parsed = parseJsonSafe(stdout);
- const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
- const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
+ const looksLikeJson = stdout.startsWith("{") || stdout.startsWith("[");
+ const hasStderr = execStderr.trim().length > 0;
+ if (exitCode !== 0 && (looksLikeJson || hasStderr)) {
+ const trimmedErr = formatStderr(execStderr);
+ throw new Error(
+ trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
+ );
+ }
+ const rawParsed = parseJsonSafe(stdout);
+ const parsed = rawParsed != null && typeof rawParsed === "object" && !Array.isArray(rawParsed) ? rawParsed : void 0;
+ const passed = exitCode === 0;
+ const assertions = parsed != null && Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
  (a) => typeof a === "object" && a !== null && typeof a.text === "string"
  ).map((a) => ({
  text: String(a.text),
  passed: Boolean(a.passed),
  ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
- })) : [];
+ })) : parsed == null ? [{ text: stdout.trim() || (passed ? "exit 0" : `exit ${exitCode}`), passed }] : [];
+ const score = parsed != null ? clampScore(
+ typeof parsed.score === "number" ? parsed.score : assertions.length > 0 ? assertions.filter((a) => a.passed).length / assertions.length : 0
+ ) : passed ? 1 : 0;
  const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
  const proxyUsage = getProxyUsage?.();
  const graderRawRequest = {
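With the block above, a non-zero exit only throws when the script also produced JSON-looking stdout or wrote to stderr; otherwise the exit code itself becomes the verdict and plain stdout is folded into a synthesized assertion. A minimal sketch of the two evaluator styles this accepts — both scripts are hypothetical, not part of the package:

```js
#!/usr/bin/env node
// Style 1 — structured: print JSON and exit 0. `score` comes from the payload,
// or (new here) from the assertion pass ratio when `score` is omitted:
// one of two assertions passing yields score 0.5.
console.log(JSON.stringify({
  assertions: [
    { text: "README exists", passed: true },
    { text: "tests pass", passed: false, evidence: "2 failing" }
  ]
}));
```

```js
#!/usr/bin/env node
// Style 2 — exit-code only (newly accepted): no JSON on stdout, nothing on
// stderr. Exit 0 scores 1; exit 1 scores 0, with stdout (or a synthesized
// "exit 1") recorded as a single pass/fail assertion. A non-zero exit still
// throws if the script printed JSON-looking stdout or wrote to stderr.
const { existsSync } = require("node:fs");
process.exit(existsSync("README.md") ? 0 : 1);
```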
@@ -820,8 +838,17 @@ var CodeGrader = class {
  }
  }
  };
+ async function runScriptRaw(scriptPath, input, agentTimeoutMs, cwd, env) {
+ return typeof scriptPath === "string" ? execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
+ }
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
- const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
+ const { stdout, stderr, exitCode } = await runScriptRaw(
+ scriptPath,
+ input,
+ agentTimeoutMs,
+ cwd,
+ env
+ );
  if (exitCode !== 0) {
  const trimmedErr = formatStderr(stderr);
  throw new Error(
@@ -841,13 +868,9 @@ function formatStderr(stderr) {
  ${tail}`;
  }

- // src/evaluation/graders/composite.ts
- import { generateText as generateText2 } from "ai";
-
  // src/evaluation/graders/llm-grader.ts
  import fs from "node:fs/promises";
  import path3 from "node:path";
- import { generateText, stepCountIs, tool } from "ai";
  import { z } from "zod";

  // src/evaluation/content-preprocessor.ts
@@ -1357,18 +1380,15 @@ ${context.toolCalls}`;
  }
  }
  // ---------------------------------------------------------------------------
- // Built-in agent mode (agentv provider — AI SDK generateText with filesystem tools)
+ // Built-in agent mode (agentv provider — provider.invoke() with filesystem tools)
  // ---------------------------------------------------------------------------
  /**
- * Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools.
+ * Built-in mode: drives the grader through provider.invoke() with the
+ * sandboxed filesystem tools and a step budget. The pi-ai-backed agentv
+ * provider runs the agent loop (tool call → tool execute → next model
+ * turn) until the model stops requesting tools or maxSteps is hit.
  */
  async evaluateBuiltIn(context, graderProvider) {
- const model = graderProvider.asLanguageModel?.();
- if (!model) {
- throw new Error(
- `Grader provider '${graderProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent mode`
- );
- }
  const workspacePath = context.workspacePath;
  if (!workspacePath) {
  throw new Error(
@@ -1387,18 +1407,21 @@ ${context.toolCalls}`;
  maxSteps: this.maxSteps
  };
  try {
- const { text, steps } = await generateText({
- model,
- system: systemPrompt,
- prompt: userPrompt,
+ const response = await graderProvider.invoke({
+ question: userPrompt,
+ systemPrompt,
+ evalCaseId: context.evalCase.id,
+ attempt: context.attempt,
+ temperature: this.temperature ?? 0,
  tools: fsTools,
- stopWhen: stepCountIs(this.maxSteps),
- temperature: this.temperature ?? 0
+ maxSteps: this.maxSteps
  });
- const toolCallCount = steps.reduce((count, step) => count + (step.toolCalls?.length ?? 0), 0);
+ const text = extractLastAssistantContent(response.output);
+ const stepCount = response.steps?.count ?? 1;
+ const toolCallCount = response.steps?.toolCallCount ?? 0;
  const details = {
  mode: "built-in",
- steps: steps.length,
+ steps: stepCount,
  tool_calls: toolCallCount
  };
  return this.parseAgentResult(
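The agent loop now lives behind `graderProvider.invoke()`: the grader hands over tool definitions and a step budget, and reads step totals off `response.steps` instead of tallying them locally. A sketch of the request/response contract as exercised above (field names are those visible in this diff; the AgentvProvider internals live in a separate chunk):

```js
// Sketch of the invoke() contract used by evaluateBuiltIn() above.
const response = await graderProvider.invoke({
  question: userPrompt,            // the grading prompt
  systemPrompt,
  evalCaseId: context.evalCase.id,
  attempt: context.attempt,
  temperature: 0,
  tools: fsTools,                  // plain tool objects (see createFilesystemTools)
  maxSteps: 10                     // illustrative budget; the grader passes this.maxSteps
});
const text = extractLastAssistantContent(response.output);
// Step accounting comes back on the response:
const stepCount = response.steps?.count ?? 1;
const toolCallCount = response.steps?.toolCallCount ?? 0;
```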
@@ -1850,43 +1873,14 @@ ${outputSchema}`;
  }
  async generateStructuredResponse(options) {
  const { context, graderProvider, systemPrompt, userPrompt, images } = options;
- const model = graderProvider.asLanguageModel?.();
- if (model) {
- const modelOptions = {
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
- };
- const hasImages = images && images.length > 0;
- const result = hasImages ? await generateText({
- model,
- system: systemPrompt,
- messages: [
- {
- role: "user",
- content: [
- { type: "text", text: userPrompt },
- ...toAiSdkImageParts(images)
- ]
- }
- ],
- ...modelOptions
- }) : await generateText({
- model,
- system: systemPrompt,
- prompt: userPrompt,
- ...modelOptions
- });
- const rawUsage = result.usage;
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
- return { text: result.text, tokenUsage };
- }
  const response = await graderProvider.invoke({
  question: userPrompt,
  systemPrompt,
  evalCaseId: context.evalCase.id,
  attempt: context.attempt,
  maxOutputTokens: this.maxOutputTokens,
- temperature: this.temperature
+ temperature: this.temperature,
+ ...images && images.length > 0 ? { images } : {}
  });
  return {
  text: extractLastAssistantContent(response.output),
@@ -2083,13 +2077,6 @@ function extractImageBlocks(messages) {
  }
  return images;
  }
- function toAiSdkImageParts(images) {
- return images.map((img) => ({
- type: "image",
- image: img.source,
- mediaType: img.media_type || void 0
- }));
- }
  function resolveSandboxed(basePath, relativePath) {
  const resolved = path3.resolve(basePath, relativePath);
  if (!resolved.startsWith(basePath + path3.sep) && resolved !== basePath) {
@@ -2098,15 +2085,24 @@ function resolveSandboxed(basePath, relativePath) {
  return resolved;
  }
  function createFilesystemTools(workspacePath) {
- return {
- list_files: tool({
+ return [
+ {
+ name: "list_files",
  description: "List files and directories at a relative path within the workspace. Returns names only (single level, no recursion).",
- inputSchema: z.object({
- path: z.string().describe('Relative path within workspace (use "." for root)').default(".")
- }),
+ parameters: {
+ type: "object",
+ properties: {
+ path: {
+ type: "string",
+ description: 'Relative path within workspace (use "." for root)',
+ default: "."
+ }
+ }
+ },
  execute: async (input) => {
+ const args = input ?? {};
  try {
- const resolved = resolveSandboxed(workspacePath, input.path);
+ const resolved = resolveSandboxed(workspacePath, args.path ?? ".");
  const entries = await fs.readdir(resolved, { withFileTypes: true });
  return entries.map((e) => ({
  name: e.name,
@@ -2116,18 +2112,25 @@ function createFilesystemTools(workspacePath) {
  return { error: error instanceof Error ? error.message : String(error) };
  }
  }
- }),
- read_file: tool({
+ },
+ {
+ name: "read_file",
  description: "Read the content of a file at a relative path within the workspace. Large files are truncated at 50KB.",
- inputSchema: z.object({
- path: z.string().describe("Relative path to file within workspace")
- }),
+ parameters: {
+ type: "object",
+ properties: {
+ path: { type: "string", description: "Relative path to file within workspace" }
+ },
+ required: ["path"]
+ },
  execute: async (input) => {
+ const args = input ?? {};
+ const relPath = args.path ?? "";
  try {
- const resolved = resolveSandboxed(workspacePath, input.path);
+ const resolved = resolveSandboxed(workspacePath, relPath);
  const stat10 = await fs.stat(resolved);
  if (stat10.isDirectory()) {
- return { error: `'${input.path}' is a directory, not a file` };
+ return { error: `'${relPath}' is a directory, not a file` };
  }
  const buffer = Buffer.alloc(Math.min(stat10.size, MAX_FILE_SIZE));
  const fd = await fs.open(resolved, "r");
@@ -2143,19 +2146,29 @@ function createFilesystemTools(workspacePath) {
  return { error: error instanceof Error ? error.message : String(error) };
  }
  }
- }),
- search_files: tool({
+ },
+ {
+ name: "search_files",
  description: "Search for a regex pattern across files in the workspace. Returns up to 20 matches. Skips binary files and node_modules/.git.",
- inputSchema: z.object({
- pattern: z.string().describe("Regex pattern to search for"),
- path: z.string().describe('Relative path to search within (use "." for root)').default(".")
- }),
+ parameters: {
+ type: "object",
+ properties: {
+ pattern: { type: "string", description: "Regex pattern to search for" },
+ path: {
+ type: "string",
+ description: 'Relative path to search within (use "." for root)',
+ default: "."
+ }
+ },
+ required: ["pattern"]
+ },
  execute: async (input) => {
+ const args = input ?? {};
  try {
- const resolved = resolveSandboxed(workspacePath, input.path);
+ const resolved = resolveSandboxed(workspacePath, args.path ?? ".");
  let regex;
  try {
- regex = new RegExp(input.pattern, "gi");
+ regex = new RegExp(args.pattern ?? "", "gi");
  } catch (regexErr) {
  return {
  error: `Invalid regex pattern: ${regexErr instanceof Error ? regexErr.message : String(regexErr)}`
@@ -2168,8 +2181,8 @@ function createFilesystemTools(workspacePath) {
  return { error: error instanceof Error ? error.message : String(error) };
  }
  }
- })
- };
+ }
+ ];
  }
  async function searchDirectory(dirPath, workspacePath, regex, matches) {
  if (matches.length >= MAX_SEARCH_MATCHES) return;
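The tool definitions above are now provider-neutral: plain objects carrying a `name`, a `description`, JSON Schema `parameters`, and an `execute` callback, with inputs defensively defaulted since nothing validates them before `execute` runs. A minimal sketch of a custom tool in the same shape — the `word_count` tool is hypothetical:

```js
// Hypothetical tool following the same plain-object shape as
// list_files/read_file/search_files above.
const wordCountTool = {
  name: "word_count",
  description: "Count the words in a string.",
  parameters: {
    type: "object",
    properties: {
      text: { type: "string", description: "Text to count words in" }
    },
    required: ["text"]
  },
  execute: async (input) => {
    const args = input ?? {}; // inputs arrive unvalidated; default defensively
    const words = (args.text ?? "").trim().split(/\s+/).filter(Boolean);
    return { count: words.length }; // return any JSON-serializable value
  }
};
```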
@@ -2449,25 +2462,6 @@ var CompositeGrader = class {
  target: graderProvider.targetName
  };
  try {
- const model = graderProvider.asLanguageModel?.();
- if (model) {
- const { text } = await generateText2({
- model,
- system: systemPrompt,
- prompt: userPrompt
- });
- const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
- const score2 = clampScore(data2.score);
- const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
- return {
- score: score2,
- verdict: scoreToVerdict(score2),
- assertions: assertions2,
- expectedAspectCount: Math.max(assertions2.length, 1),
- graderRawRequest,
- scores
- };
- }
  const response = await graderProvider.invoke({
  question: userPrompt,
  systemPrompt,
@@ -2625,7 +2619,7 @@ var DEFAULT_EXPLORATION_TOOLS = [
  function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
  if (summary.eventCount === 0) return void 0;
  const explorationCalls = explorationTools.reduce(
- (sum, tool2) => sum + (summary.toolCalls[tool2] ?? 0),
+ (sum, tool) => sum + (summary.toolCalls[tool] ?? 0),
  0
  );
  return explorationCalls / summary.eventCount;
@@ -4261,422 +4255,6 @@ function runEqualsAssertion(output, value) {
  };
  }

- // src/evaluation/providers/ai-sdk.ts
- import { createAnthropic } from "@ai-sdk/anthropic";
- import { createAzure } from "@ai-sdk/azure";
- import { createGoogleGenerativeAI } from "@ai-sdk/google";
- import { createOpenAI } from "@ai-sdk/openai";
- import { createOpenRouter } from "@openrouter/ai-sdk-provider";
- import { generateText as generateText3 } from "ai";
- var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
- var OpenAIProvider = class {
- constructor(targetName, config) {
- this.config = config;
- this.id = `openai:${targetName}`;
- this.targetName = targetName;
- this.defaults = {
- temperature: config.temperature,
- maxOutputTokens: config.maxOutputTokens
- };
- this.retryConfig = config.retry;
- const openai = createOpenAI({
- apiKey: config.apiKey,
- baseURL: config.baseURL
- });
- this.model = config.apiFormat === "responses" ? openai(config.model) : openai.chat(config.model);
- }
- id;
- kind = "openai";
- targetName;
- model;
- defaults;
- retryConfig;
- async invoke(request) {
- return invokeModel({
- model: this.model,
- request,
- defaults: this.defaults,
- retryConfig: this.retryConfig
- });
- }
- asLanguageModel() {
- return this.model;
- }
- };
- var AzureProvider = class {
- constructor(targetName, config) {
- this.config = config;
- this.id = `azure:${targetName}`;
- this.targetName = targetName;
- this.defaults = {
- temperature: config.temperature,
- maxOutputTokens: config.maxOutputTokens
- };
- this.retryConfig = config.retry;
- const azure = createAzure(buildAzureOptions(config));
- this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
- }
- id;
- kind = "azure";
- targetName;
- model;
- defaults;
- retryConfig;
- async invoke(request) {
- return invokeModel({
- model: this.model,
- request,
- defaults: this.defaults,
- retryConfig: this.retryConfig
- });
- }
- asLanguageModel() {
- return this.model;
- }
- };
- var OpenRouterProvider = class {
- constructor(targetName, config) {
- this.config = config;
- this.id = `openrouter:${targetName}`;
- this.targetName = targetName;
- this.defaults = {
- temperature: config.temperature,
- maxOutputTokens: config.maxOutputTokens
- };
- this.retryConfig = config.retry;
- const openrouter = createOpenRouter({
- apiKey: config.apiKey
- });
- this.model = openrouter(config.model);
- }
- id;
- kind = "openrouter";
- targetName;
- model;
- defaults;
- retryConfig;
- async invoke(request) {
- return invokeModel({
- model: this.model,
- request,
- defaults: this.defaults,
- retryConfig: this.retryConfig
- });
- }
- asLanguageModel() {
- return this.model;
- }
- };
- var AnthropicProvider = class {
- constructor(targetName, config) {
- this.config = config;
- this.id = `anthropic:${targetName}`;
- this.targetName = targetName;
- this.defaults = {
- temperature: config.temperature,
- maxOutputTokens: config.maxOutputTokens,
- thinkingBudget: config.thinkingBudget
- };
- this.retryConfig = config.retry;
- const anthropic = createAnthropic({
- apiKey: config.apiKey
- });
- this.model = anthropic(config.model);
- }
- id;
- kind = "anthropic";
- targetName;
- model;
- defaults;
- retryConfig;
- async invoke(request) {
- const providerOptions = buildAnthropicProviderOptions(this.defaults);
- return invokeModel({
- model: this.model,
- request,
- defaults: this.defaults,
- retryConfig: this.retryConfig,
- providerOptions
- });
- }
- asLanguageModel() {
- return this.model;
- }
- };
- var GeminiProvider = class {
- constructor(targetName, config) {
- this.config = config;
- this.id = `gemini:${targetName}`;
- this.targetName = targetName;
- this.defaults = {
- temperature: config.temperature,
- maxOutputTokens: config.maxOutputTokens
- };
- this.retryConfig = config.retry;
- const google = createGoogleGenerativeAI({
- apiKey: config.apiKey
- });
- this.model = google(config.model);
- }
- id;
- kind = "gemini";
- targetName;
- model;
- defaults;
- retryConfig;
- async invoke(request) {
- return invokeModel({
- model: this.model,
- request,
- defaults: this.defaults,
- retryConfig: this.retryConfig
- });
- }
- asLanguageModel() {
- return this.model;
- }
- };
- function buildAzureOptions(config) {
- const options = {
- apiKey: config.apiKey,
- apiVersion: config.version,
- // Chat completions still use deployment-scoped Azure URLs for compatibility
- // with existing deployments. Responses API should use the SDK's v1 path.
- useDeploymentBasedUrls: config.apiFormat !== "responses"
- };
- const baseURL = normalizeAzureBaseUrl(config.resourceName);
- if (baseURL) {
- options.baseURL = baseURL;
- } else {
- options.resourceName = config.resourceName;
- }
- return options;
- }
- function normalizeAzureBaseUrl(resourceName) {
- const trimmed = resourceName.trim();
- if (!/^https?:\/\//i.test(trimmed)) {
- return void 0;
- }
- const withoutSlash = trimmed.replace(/\/+$/, "");
- const normalized = withoutSlash.endsWith("/openai") ? withoutSlash : `${withoutSlash}/openai`;
- return normalized;
- }
- function buildAnthropicProviderOptions(defaults) {
- if (defaults.thinkingBudget === void 0) {
- return void 0;
- }
- return {
- anthropic: {
- thinking: {
- type: "enabled",
- budgetTokens: defaults.thinkingBudget
- }
- }
- };
- }
- function buildChatPrompt(request) {
- const provided = request.chatPrompt?.length ? request.chatPrompt : void 0;
- if (provided) {
- const hasSystemMessage = provided.some((message) => message.role === "system");
- if (hasSystemMessage) {
- return provided;
- }
- const systemContent2 = resolveSystemContent(request);
- return [{ role: "system", content: systemContent2 }, ...provided];
- }
- const systemContent = resolveSystemContent(request);
- const userContent = request.question.trim();
- const prompt = [
- { role: "system", content: systemContent },
- { role: "user", content: userContent }
- ];
- return prompt;
- }
- function resolveSystemContent(request) {
- const systemSegments = [];
- if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
- systemSegments.push(request.systemPrompt.trim());
- } else {
- systemSegments.push(DEFAULT_SYSTEM_PROMPT);
- }
- return systemSegments.join("\n\n");
- }
- function toModelMessages(chatPrompt) {
- return chatPrompt.map((message) => {
- if (message.role === "tool" || message.role === "function") {
- const prefix = message.name ? `@[${message.name}]: ` : "@[Tool]: ";
- return {
- role: "assistant",
- content: `${prefix}${message.content}`
- };
- }
- if (message.role === "assistant" || message.role === "system" || message.role === "user") {
- return {
- role: message.role,
- content: message.content
- };
- }
- return {
- role: "user",
- content: message.content
- };
- });
- }
- function resolveModelSettings(request, defaults) {
- const temperature = request.temperature ?? defaults.temperature;
- const maxOutputTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
- return {
- temperature,
- maxOutputTokens
- };
- }
- async function invokeModel(options) {
- const { model, request, defaults, retryConfig, providerOptions } = options;
- const chatPrompt = buildChatPrompt(request);
- const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
- const startTime = (/* @__PURE__ */ new Date()).toISOString();
- const startMs = Date.now();
- const result = await withRetry(
- () => generateText3({
- model,
- messages: toModelMessages(chatPrompt),
- temperature,
- maxOutputTokens,
- maxRetries: 0,
- abortSignal: request.signal,
- ...providerOptions ? { providerOptions } : {}
- }),
- retryConfig,
- request.signal
- );
- const endTime = (/* @__PURE__ */ new Date()).toISOString();
- const durationMs = Date.now() - startMs;
- return mapResponse(result, { durationMs, startTime, endTime });
- }
- function mapResponse(result, timing) {
- const content = result.text ?? "";
- const rawUsage = result.totalUsage ?? result.usage;
- const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
- const cached = rawUsage?.inputTokenDetails?.cacheReadTokens ?? void 0;
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? {
- input: rawUsage.inputTokens,
- output: rawUsage.outputTokens,
- ...reasoning != null ? { reasoning } : {},
- ...cached != null ? { cached } : {}
- } : void 0;
- return {
- raw: result,
- usage: toJsonObject(rawUsage),
- output: [{ role: "assistant", content }],
- tokenUsage,
- durationMs: timing?.durationMs,
- startTime: timing?.startTime,
- endTime: timing?.endTime
- };
- }
- function toJsonObject(value) {
- if (!value || typeof value !== "object") {
- return void 0;
- }
- try {
- return JSON.parse(JSON.stringify(value));
- } catch {
- return void 0;
- }
- }
- function extractStatus(error) {
- if (!error || typeof error !== "object") {
- return void 0;
- }
- const candidate = error;
- const directStatus = candidate.status ?? candidate.statusCode;
- if (typeof directStatus === "number" && Number.isFinite(directStatus)) {
- return directStatus;
- }
- const responseStatus = typeof candidate.response === "object" && candidate.response ? candidate.response.status : void 0;
- if (typeof responseStatus === "number" && Number.isFinite(responseStatus)) {
- return responseStatus;
- }
- const message = typeof candidate.message === "string" ? candidate.message : void 0;
- if (message) {
- const match = message.match(/HTTP\s+(\d{3})/i);
- if (match) {
- const parsed = Number.parseInt(match[1], 10);
- if (Number.isFinite(parsed)) {
- return parsed;
- }
- }
- }
- return void 0;
- }
- function isNetworkError(error) {
- if (!error || typeof error !== "object") {
- return false;
- }
- const candidate = error;
- if (candidate.name === "AbortError") {
- return false;
- }
- const code = candidate.code;
- if (typeof code === "string" && /^E(AI|CONN|HOST|NET|PIPE|TIME|REFUSED|RESET)/i.test(code)) {
- return true;
- }
- const message = typeof candidate.message === "string" ? candidate.message : void 0;
- if (message && /(network|fetch failed|ECONNRESET|ENOTFOUND|EAI_AGAIN|ETIMEDOUT|ECONNREFUSED)/i.test(message)) {
- return true;
- }
- return false;
- }
- function isRetryableError(error, retryableStatusCodes) {
- const status = extractStatus(error);
- if (status === 401 || status === 403) {
- return false;
- }
- if (typeof status === "number") {
- return retryableStatusCodes.includes(status);
- }
- return isNetworkError(error);
- }
- function calculateRetryDelay(attempt, config) {
- const delay = Math.min(
- config.maxDelayMs,
- config.initialDelayMs * config.backoffFactor ** attempt
- );
- return delay * (0.75 + Math.random() * 0.5);
- }
- async function sleep(ms) {
- return new Promise((resolve) => setTimeout(resolve, ms));
- }
- async function withRetry(fn, retryConfig, signal) {
- const config = {
- maxRetries: retryConfig?.maxRetries ?? 3,
- initialDelayMs: retryConfig?.initialDelayMs ?? 1e3,
- maxDelayMs: retryConfig?.maxDelayMs ?? 6e4,
- backoffFactor: retryConfig?.backoffFactor ?? 2,
- retryableStatusCodes: retryConfig?.retryableStatusCodes ?? [500, 408, 429, 502, 503, 504]
- };
- let lastError;
- for (let attempt = 0; attempt <= config.maxRetries; attempt++) {
- if (signal?.aborted) {
- throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
- }
- try {
- return await fn();
- } catch (error) {
- lastError = error;
- if (attempt >= config.maxRetries) {
- break;
- }
- if (!isRetryableError(error, config.retryableStatusCodes)) {
- throw error;
- }
- const delay = calculateRetryDelay(attempt, config);
- await sleep(delay);
- }
- }
- throw lastError;
- }
-
  // src/evaluation/providers/claude-cli.ts
  import { spawn } from "node:child_process";
  import { randomUUID } from "node:crypto";
@@ -9054,10 +8632,10 @@ function extractToolCallsFromEvents(events) {
  }
  }
  const toolCalls = [];
- for (const [id, { tool: tool2, input }] of starts) {
+ for (const [id, { tool, input }] of starts) {
  toolCalls.push(
  normalizeToolCall("pi-cli", {
- tool: tool2,
+ tool,
  input,
  id: id.startsWith("anon-") ? void 0 : id,
  output: results.get(id)
@@ -10124,7 +9702,7 @@ import { readFile as readFile5 } from "node:fs/promises";
  import path20 from "node:path";

  // src/evaluation/providers/vscode/utils/time.ts
- function sleep2(ms) {
+ function sleep(ms) {
  return new Promise((resolve) => {
  setTimeout(resolve, ms);
  });
@@ -10147,7 +9725,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
  }
  return false;
  }
- await sleep2(pollInterval);
+ await sleep(pollInterval);
  }
  } catch (error) {
  if (error.code === "ENOENT") {
@@ -10173,7 +9751,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
  }
  return false;
  }
- await sleep2(pollInterval);
+ await sleep(pollInterval);
  }
  }
  return false;
@@ -10202,7 +9780,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
  }
  }
  if (pending.size > 0) {
- await sleep2(pollInterval);
+ await sleep(pollInterval);
  }
  }
  } catch (error) {
@@ -10230,7 +9808,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
  }
  return false;
  }
- await sleep2(pollInterval);
+ await sleep(pollInterval);
  }
  }
  }
@@ -10326,7 +9904,7 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
  label: "open-workspace"
  });
  await raceSpawnError(workspaceChild);
- await sleep2(100);
+ await sleep(100);
  const wakeupChatId = "wakeup";
  const chatArgs = [
  "-r",
@@ -10343,7 +9921,7 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
  console.error(`warning: Workspace readiness timeout after ${timeout}s`);
  return false;
  }
- await sleep2(pollInterval * 1e3);
+ await sleep(pollInterval * 1e3);
  }
  return true;
  }
@@ -10371,7 +9949,7 @@ async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, reques
  `VS Code workspace '${path22.basename(subagentDir)}' failed to become ready within the timeout. Check that '${vscodeCmd}' can open workspaces.`
  );
  }
- await sleep2(500);
+ await sleep(500);
  const child = spawnVsCode(vscodeCmd, chatArgs, { label: "send-chat" });
  await raceSpawnError(child);
  }
@@ -10395,7 +9973,7 @@ async function launchVsCodeWithBatchChat(subagentDir, chatId, attachmentPaths, c
  `VS Code workspace '${path22.basename(subagentDir)}' failed to become ready within the timeout. Check that '${vscodeCmd}' can open workspaces.`
  );
  }
- await sleep2(500);
+ await sleep(500);
  const child = spawnVsCode(vscodeCmd, chatArgs, { label: "send-batch-chat" });
  await raceSpawnError(child);
  }
@@ -16105,7 +15683,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
  return { tests: await loadTestsFromAgentSkills(evalFilePath) };
  }
  if (format === "typescript") {
- const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-4CFPGHGT.js");
+ const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-E6MROJGR.js");
  return loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
  }
  const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
@@ -16140,7 +15718,7 @@ async function loadTests(evalFilePath, repoRoot, options) {
  return loadTestsFromAgentSkills(evalFilePath);
  }
  if (format === "typescript") {
- const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-4CFPGHGT.js");
+ const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-E6MROJGR.js");
  const suite = await loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
  return suite.tests;
  }
@@ -16496,7 +16074,8 @@ function parseWorkspaceConfig(raw, evalFileDir) {
  const workspacePath = typeof obj.path === "string" ? obj.path : void 0;
  const mode = explicitMode ?? (workspacePath ? "static" : void 0);
  const docker = parseDockerWorkspaceConfig(obj.docker);
- if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker)
+ const env = parseWorkspaceEnvConfig(obj.env);
+ if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker && !env)
  return void 0;
  return {
  ...template !== void 0 && { template },
@@ -16505,7 +16084,19 @@ function parseWorkspaceConfig(raw, evalFileDir) {
  ...hooks !== void 0 && { hooks },
  ...mode !== void 0 && { mode },
  ...workspacePath !== void 0 && { path: workspacePath },
- ...docker !== void 0 && { docker }
+ ...docker !== void 0 && { docker },
+ ...env !== void 0 && { env }
+ };
+ }
+ function parseWorkspaceEnvConfig(raw) {
+ if (!isJsonObject(raw)) return void 0;
+ const obj = raw;
+ const required_commands = Array.isArray(obj.required_commands) ? obj.required_commands.filter((c) => typeof c === "string") : void 0;
+ const required_python_modules = Array.isArray(obj.required_python_modules) ? obj.required_python_modules.filter((m) => typeof m === "string") : void 0;
+ if (!required_commands?.length && !required_python_modules?.length) return void 0;
+ return {
+ ...required_commands?.length && { required_commands },
+ ...required_python_modules?.length && { required_python_modules }
  };
  }
  function parseDockerWorkspaceConfig(raw) {
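`parseWorkspaceEnvConfig` above introduces a `workspace.env` block for declaring runtime prerequisites; non-string entries are filtered out and an empty block is dropped entirely. A sketch of what the parser accepts — the call site and directory path are illustrative:

```js
// In an eval file's YAML this would correspond to:
//   workspace:
//     env:
//       required_commands: [git, jq]
//       required_python_modules: [yaml]
const workspace = parseWorkspaceConfig(
  { env: { required_commands: ["git", "jq"], required_python_modules: ["yaml"] } },
  "/path/to/eval-dir" // illustrative evalFileDir
);
// => { env: { required_commands: ["git", "jq"], required_python_modules: ["yaml"] } }
```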
@@ -16865,7 +16456,7 @@ async function runEvaluation(options) {
  if (!cliModel) {
  throw new Error('--grader-target "agentv" requires --model (e.g., "openai:gpt-5-mini")');
  }
- const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-TXM4UEUT.js");
+ const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-MUIGGIP3.js");
  return new AgentvProvider2("agentv", { model: cliModel, temperature: 0 });
  }
  const overrideTarget = resolveTargetByName(cliGraderTarget);
@@ -17196,6 +16787,19 @@ async function runEvaluation(options) {
  await dockerSetup.pullImage();
  setupLog("Docker image pull complete");
  }
+ if (suiteWorkspace?.env) {
+ try {
+ await runPreflightChecks(suiteWorkspace.env, sharedWorkspacePath ?? void 0, setupLog);
+ setupLog("preflight checks passed");
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ if (sharedWorkspacePath && !useStaticWorkspace) {
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
+ });
+ }
+ throw new Error(message);
+ }
+ }
  const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
  const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
  if (sharedWorkspacePath && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) {
@@ -18220,7 +17824,7 @@ async function runEvalCase(options) {
  lastError = error;
  if (attempt + 1 < attemptBudget) {
  const delayMs = retryBackoffMs(attempt);
- await sleep3(delayMs, signal);
+ await sleep2(delayMs, signal);
  attempt += 1;
  continue;
  }
@@ -19425,7 +19029,7 @@ function extractErrorMessage(error) {
  function retryBackoffMs(attempt) {
  return Math.min(2 ** attempt * 1e3, 3e4);
  }
- function sleep3(ms, signal) {
+ function sleep2(ms, signal) {
  if (signal?.aborted) return Promise.resolve();
  return new Promise((resolve) => {
  const timer = setTimeout(resolve, ms);
@@ -19466,6 +19070,38 @@ function computeWeightedMean(entries) {
  }
  return totalWeight > 0 ? weightedSum / totalWeight : 0;
  }
+ async function runPreflightChecks(env, cwd, log) {
+ const execFileAsync4 = promisify7(execFile3);
+ const missing = [];
+ for (const cmd of env.required_commands ?? []) {
+ log(`preflight: checking command "${cmd}"`);
+ try {
+ if (process.platform === "win32") {
+ await execFileAsync4("where", [cmd], { cwd });
+ } else {
+ await execFileAsync4("sh", ["-c", `command -v ${cmd}`], { cwd });
+ }
+ } catch {
+ missing.push(`command: ${cmd}`);
+ }
+ }
+ for (const mod of env.required_python_modules ?? []) {
+ log(`preflight: checking Python module "${mod}"`);
+ try {
+ await execFileAsync4("python3", ["-c", `import ${mod}`], { cwd });
+ } catch {
+ missing.push(`python module: ${mod}`);
+ }
+ }
+ if (missing.length > 0) {
+ throw new Error(
+ `Preflight checks failed \u2014 missing dependencies:
+ ${missing.map((m) => ` \u2022 ${m}`).join("\n")}
+
+ Install the missing dependencies before running this eval.`
+ );
+ }
+ }

  // src/evaluation/providers/function-provider.ts
  function createFunctionProvider(taskFn) {
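`runPreflightChecks` above probes each declared dependency up front — `where` on Windows, `command -v` through `sh -c` elsewhere, and `python3 -c "import <mod>"` for Python modules — and collects every miss into one error instead of failing on the first. A sketch of calling it directly, with hypothetical dependencies:

```js
// Hypothetical direct call; runEvaluation() wires this up from workspace.env.
try {
  await runPreflightChecks(
    { required_commands: ["git", "jq"], required_python_modules: ["yaml"] },
    process.cwd(),             // cwd passed through to the probe commands
    (msg) => console.log(msg)  // setupLog-style logger
  );
} catch (err) {
  // With jq and PyYAML absent, all misses are reported together:
  //   Preflight checks failed — missing dependencies:
  //    • command: jq
  //    • python module: yaml
  //   Install the missing dependencies before running this eval.
  console.error(err.message);
}
```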
@@ -19954,4 +19590,4 @@ export {
  loadTestById,
  loadEvalCaseById
  };
- //# sourceMappingURL=chunk-IXTJEXWN.js.map
+ //# sourceMappingURL=chunk-EVEZQXIS.js.map