@agentv/core 4.25.1-next.1 → 4.25.2-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -223,7 +223,7 @@ function computeTraceSummary(messages) {
223
223
  function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
224
224
  if (summary.eventCount === 0) return void 0;
225
225
  const explorationCalls = explorationTools.reduce(
226
- (sum, tool2) => sum + (summary.toolCalls[tool2] ?? 0),
226
+ (sum, tool) => sum + (summary.toolCalls[tool] ?? 0),
227
227
  0
228
228
  );
229
229
  return explorationCalls / summary.eventCount;
@@ -5187,8 +5187,17 @@ async function materializeContentForGrader(messages, getWorkDir) {
5187
5187
  }
5188
5188
  return result;
5189
5189
  }
5190
+ async function runScriptRaw(scriptPath, input, agentTimeoutMs, cwd, env) {
5191
+ return typeof scriptPath === "string" ? execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
5192
+ }
5190
5193
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
5191
- const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
5194
+ const { stdout, stderr, exitCode } = await runScriptRaw(
5195
+ scriptPath,
5196
+ input,
5197
+ agentTimeoutMs,
5198
+ cwd,
5199
+ env
5200
+ );
5192
5201
  if (exitCode !== 0) {
5193
5202
  const trimmedErr = formatStderr(stderr);
5194
5203
  throw new Error(
@@ -5306,6 +5315,8 @@ var init_code_grader = __esm({
5306
5315
  const env = proxyEnv || workspaceEnv ? { ...proxyEnv, ...workspaceEnv } : void 0;
5307
5316
  try {
5308
5317
  let stdout;
5318
+ let exitCode = 0;
5319
+ let execStderr = "";
5309
5320
  if (context2.dockerConfig) {
5310
5321
  const { DockerWorkspaceProvider: DockerWorkspaceProvider2 } = await Promise.resolve().then(() => (init_docker_workspace(), docker_workspace_exports));
5311
5322
  const dockerProvider = new DockerWorkspaceProvider2(context2.dockerConfig);
@@ -5314,31 +5325,40 @@ var init_code_grader = __esm({
5314
5325
  stdin: inputPayload,
5315
5326
  repoCheckouts: getRepoCheckoutTargets(context2.evalCase.workspace?.repos)
5316
5327
  });
5317
- if (result.exitCode !== 0) {
5318
- const trimmedErr = result.stderr.trim();
5319
- throw new Error(
5320
- trimmedErr.length > 0 ? `Code evaluator exited with code ${result.exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${result.exitCode}`
5321
- );
5322
- }
5328
+ exitCode = result.exitCode;
5323
5329
  stdout = result.stdout.trim();
5330
+ execStderr = result.stderr;
5324
5331
  } else {
5325
- stdout = await executeScript(
5332
+ const result = await runScriptRaw(
5326
5333
  this.command,
5327
5334
  inputPayload,
5328
5335
  this.agentTimeoutMs,
5329
5336
  this.cwd,
5330
5337
  env
5331
5338
  );
5339
+ exitCode = result.exitCode;
5340
+ stdout = result.stdout.trim();
5341
+ execStderr = result.stderr;
5332
5342
  }
5333
- const parsed = parseJsonSafe(stdout);
5334
- const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
5335
- const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
5343
+ const looksLikeJson = stdout.startsWith("{") || stdout.startsWith("[");
5344
+ const hasStderr = execStderr.trim().length > 0;
5345
+ if (exitCode !== 0 && (looksLikeJson || hasStderr)) {
5346
+ const trimmedErr = formatStderr(execStderr);
5347
+ throw new Error(
5348
+ trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
5349
+ );
5350
+ }
5351
+ const rawParsed = parseJsonSafe(stdout);
5352
+ const parsed = rawParsed != null && typeof rawParsed === "object" && !Array.isArray(rawParsed) ? rawParsed : void 0;
5353
+ const passed = exitCode === 0;
5354
+ const score = parsed != null ? clampScore(typeof parsed.score === "number" ? parsed.score : 0) : passed ? 1 : 0;
5355
+ const assertions = parsed != null && Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
5336
5356
  (a) => typeof a === "object" && a !== null && typeof a.text === "string"
5337
5357
  ).map((a) => ({
5338
5358
  text: String(a.text),
5339
5359
  passed: Boolean(a.passed),
5340
5360
  ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
5341
- })) : [];
5361
+ })) : parsed == null ? [{ text: stdout.trim() || (passed ? "exit 0" : `exit ${exitCode}`), passed }] : [];
5342
5362
  const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
5343
5363
  const proxyUsage = getProxyUsage?.();
5344
5364
  const graderRawRequest = {
@@ -5646,13 +5666,6 @@ function extractImageBlocks(messages) {
5646
5666
  }
5647
5667
  return images;
5648
5668
  }
5649
- function toAiSdkImageParts(images) {
5650
- return images.map((img) => ({
5651
- type: "image",
5652
- image: img.source,
5653
- mediaType: img.media_type || void 0
5654
- }));
5655
- }
5656
5669
  function resolveSandboxed(basePath, relativePath) {
5657
5670
  const resolved = import_node_path12.default.resolve(basePath, relativePath);
5658
5671
  if (!resolved.startsWith(basePath + import_node_path12.default.sep) && resolved !== basePath) {
@@ -5661,15 +5674,24 @@ function resolveSandboxed(basePath, relativePath) {
5661
5674
  return resolved;
5662
5675
  }
5663
5676
  function createFilesystemTools(workspacePath) {
5664
- return {
5665
- list_files: (0, import_ai.tool)({
5677
+ return [
5678
+ {
5679
+ name: "list_files",
5666
5680
  description: "List files and directories at a relative path within the workspace. Returns names only (single level, no recursion).",
5667
- inputSchema: import_zod2.z.object({
5668
- path: import_zod2.z.string().describe('Relative path within workspace (use "." for root)').default(".")
5669
- }),
5681
+ parameters: {
5682
+ type: "object",
5683
+ properties: {
5684
+ path: {
5685
+ type: "string",
5686
+ description: 'Relative path within workspace (use "." for root)',
5687
+ default: "."
5688
+ }
5689
+ }
5690
+ },
5670
5691
  execute: async (input) => {
5692
+ const args = input ?? {};
5671
5693
  try {
5672
- const resolved = resolveSandboxed(workspacePath, input.path);
5694
+ const resolved = resolveSandboxed(workspacePath, args.path ?? ".");
5673
5695
  const entries = await import_promises12.default.readdir(resolved, { withFileTypes: true });
5674
5696
  return entries.map((e) => ({
5675
5697
  name: e.name,
@@ -5679,18 +5701,25 @@ function createFilesystemTools(workspacePath) {
5679
5701
  return { error: error instanceof Error ? error.message : String(error) };
5680
5702
  }
5681
5703
  }
5682
- }),
5683
- read_file: (0, import_ai.tool)({
5704
+ },
5705
+ {
5706
+ name: "read_file",
5684
5707
  description: "Read the content of a file at a relative path within the workspace. Large files are truncated at 50KB.",
5685
- inputSchema: import_zod2.z.object({
5686
- path: import_zod2.z.string().describe("Relative path to file within workspace")
5687
- }),
5708
+ parameters: {
5709
+ type: "object",
5710
+ properties: {
5711
+ path: { type: "string", description: "Relative path to file within workspace" }
5712
+ },
5713
+ required: ["path"]
5714
+ },
5688
5715
  execute: async (input) => {
5716
+ const args = input ?? {};
5717
+ const relPath = args.path ?? "";
5689
5718
  try {
5690
- const resolved = resolveSandboxed(workspacePath, input.path);
5719
+ const resolved = resolveSandboxed(workspacePath, relPath);
5691
5720
  const stat14 = await import_promises12.default.stat(resolved);
5692
5721
  if (stat14.isDirectory()) {
5693
- return { error: `'${input.path}' is a directory, not a file` };
5722
+ return { error: `'${relPath}' is a directory, not a file` };
5694
5723
  }
5695
5724
  const buffer = Buffer.alloc(Math.min(stat14.size, MAX_FILE_SIZE));
5696
5725
  const fd = await import_promises12.default.open(resolved, "r");
@@ -5706,19 +5735,29 @@ function createFilesystemTools(workspacePath) {
5706
5735
  return { error: error instanceof Error ? error.message : String(error) };
5707
5736
  }
5708
5737
  }
5709
- }),
5710
- search_files: (0, import_ai.tool)({
5738
+ },
5739
+ {
5740
+ name: "search_files",
5711
5741
  description: "Search for a regex pattern across files in the workspace. Returns up to 20 matches. Skips binary files and node_modules/.git.",
5712
- inputSchema: import_zod2.z.object({
5713
- pattern: import_zod2.z.string().describe("Regex pattern to search for"),
5714
- path: import_zod2.z.string().describe('Relative path to search within (use "." for root)').default(".")
5715
- }),
5742
+ parameters: {
5743
+ type: "object",
5744
+ properties: {
5745
+ pattern: { type: "string", description: "Regex pattern to search for" },
5746
+ path: {
5747
+ type: "string",
5748
+ description: 'Relative path to search within (use "." for root)',
5749
+ default: "."
5750
+ }
5751
+ },
5752
+ required: ["pattern"]
5753
+ },
5716
5754
  execute: async (input) => {
5755
+ const args = input ?? {};
5717
5756
  try {
5718
- const resolved = resolveSandboxed(workspacePath, input.path);
5757
+ const resolved = resolveSandboxed(workspacePath, args.path ?? ".");
5719
5758
  let regex;
5720
5759
  try {
5721
- regex = new RegExp(input.pattern, "gi");
5760
+ regex = new RegExp(args.pattern ?? "", "gi");
5722
5761
  } catch (regexErr) {
5723
5762
  return {
5724
5763
  error: `Invalid regex pattern: ${regexErr instanceof Error ? regexErr.message : String(regexErr)}`
@@ -5731,8 +5770,8 @@ function createFilesystemTools(workspacePath) {
5731
5770
  return { error: error instanceof Error ? error.message : String(error) };
5732
5771
  }
5733
5772
  }
5734
- })
5735
- };
5773
+ }
5774
+ ];
5736
5775
  }
5737
5776
  async function searchDirectory(dirPath, workspacePath, regex, matches) {
5738
5777
  if (matches.length >= MAX_SEARCH_MATCHES) return;
@@ -5772,14 +5811,13 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
5772
5811
  }
5773
5812
  }
5774
5813
  }
5775
- var import_promises12, import_node_path12, import_ai, import_zod2, DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT, MAX_FILE_SIZE, MAX_SEARCH_MATCHES, SEARCH_SKIP_DIRS, BINARY_EXTENSIONS, DEFAULT_GRADER_TEMPLATE, freeformEvaluationSchema, rubricCheckResultSchema, rubricEvaluationSchema, scoreRangeCheckResultSchema, scoreRangeEvaluationSchema, LlmGrader, ANSI_YELLOW7, ANSI_RESET8, warnedTemplateStrings;
5814
+ var import_promises12, import_node_path12, import_zod2, DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT, MAX_FILE_SIZE, MAX_SEARCH_MATCHES, SEARCH_SKIP_DIRS, BINARY_EXTENSIONS, DEFAULT_GRADER_TEMPLATE, freeformEvaluationSchema, rubricCheckResultSchema, rubricEvaluationSchema, scoreRangeCheckResultSchema, scoreRangeEvaluationSchema, LlmGrader, ANSI_YELLOW7, ANSI_RESET8, warnedTemplateStrings;
5776
5815
  var init_llm_grader = __esm({
5777
5816
  "src/evaluation/graders/llm-grader.ts"() {
5778
5817
  "use strict";
5779
5818
  init_cjs_shims();
5780
5819
  import_promises12 = __toESM(require("fs/promises"), 1);
5781
5820
  import_node_path12 = __toESM(require("path"), 1);
5782
- import_ai = require("ai");
5783
5821
  import_zod2 = require("zod");
5784
5822
  init_content_preprocessor();
5785
5823
  init_content();
@@ -6095,18 +6133,15 @@ ${context2.toolCalls}`;
6095
6133
  }
6096
6134
  }
6097
6135
  // ---------------------------------------------------------------------------
6098
- // Built-in agent mode (agentv provider — AI SDK generateText with filesystem tools)
6136
+ // Built-in agent mode (agentv provider — provider.invoke() with filesystem tools)
6099
6137
  // ---------------------------------------------------------------------------
6100
6138
  /**
6101
- * Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools.
6139
+ * Built-in mode: drives the grader through provider.invoke() with the
6140
+ * sandboxed filesystem tools and a step budget. The pi-ai-backed agentv
6141
+ * provider runs the agent loop (tool call → tool execute → next model
6142
+ * turn) until the model stops requesting tools or maxSteps is hit.
6102
6143
  */
6103
6144
  async evaluateBuiltIn(context2, graderProvider) {
6104
- const model = graderProvider.asLanguageModel?.();
6105
- if (!model) {
6106
- throw new Error(
6107
- `Grader provider '${graderProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent mode`
6108
- );
6109
- }
6110
6145
  const workspacePath = context2.workspacePath;
6111
6146
  if (!workspacePath) {
6112
6147
  throw new Error(
@@ -6125,18 +6160,21 @@ ${context2.toolCalls}`;
6125
6160
  maxSteps: this.maxSteps
6126
6161
  };
6127
6162
  try {
6128
- const { text, steps } = await (0, import_ai.generateText)({
6129
- model,
6130
- system: systemPrompt,
6131
- prompt: userPrompt,
6163
+ const response = await graderProvider.invoke({
6164
+ question: userPrompt,
6165
+ systemPrompt,
6166
+ evalCaseId: context2.evalCase.id,
6167
+ attempt: context2.attempt,
6168
+ temperature: this.temperature ?? 0,
6132
6169
  tools: fsTools,
6133
- stopWhen: (0, import_ai.stepCountIs)(this.maxSteps),
6134
- temperature: this.temperature ?? 0
6170
+ maxSteps: this.maxSteps
6135
6171
  });
6136
- const toolCallCount = steps.reduce((count, step) => count + (step.toolCalls?.length ?? 0), 0);
6172
+ const text = extractLastAssistantContent2(response.output);
6173
+ const stepCount = response.steps?.count ?? 1;
6174
+ const toolCallCount = response.steps?.toolCallCount ?? 0;
6137
6175
  const details = {
6138
6176
  mode: "built-in",
6139
- steps: steps.length,
6177
+ steps: stepCount,
6140
6178
  tool_calls: toolCallCount
6141
6179
  };
6142
6180
  return this.parseAgentResult(
@@ -6588,43 +6626,14 @@ ${outputSchema}`;
6588
6626
  }
6589
6627
  async generateStructuredResponse(options) {
6590
6628
  const { context: context2, graderProvider, systemPrompt, userPrompt, images } = options;
6591
- const model = graderProvider.asLanguageModel?.();
6592
- if (model) {
6593
- const modelOptions = {
6594
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
6595
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
6596
- };
6597
- const hasImages = images && images.length > 0;
6598
- const result = hasImages ? await (0, import_ai.generateText)({
6599
- model,
6600
- system: systemPrompt,
6601
- messages: [
6602
- {
6603
- role: "user",
6604
- content: [
6605
- { type: "text", text: userPrompt },
6606
- ...toAiSdkImageParts(images)
6607
- ]
6608
- }
6609
- ],
6610
- ...modelOptions
6611
- }) : await (0, import_ai.generateText)({
6612
- model,
6613
- system: systemPrompt,
6614
- prompt: userPrompt,
6615
- ...modelOptions
6616
- });
6617
- const rawUsage = result.usage;
6618
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
6619
- return { text: result.text, tokenUsage };
6620
- }
6621
6629
  const response = await graderProvider.invoke({
6622
6630
  question: userPrompt,
6623
6631
  systemPrompt,
6624
6632
  evalCaseId: context2.evalCase.id,
6625
6633
  attempt: context2.attempt,
6626
6634
  maxOutputTokens: this.maxOutputTokens,
6627
- temperature: this.temperature
6635
+ temperature: this.temperature,
6636
+ ...images && images.length > 0 ? { images } : {}
6628
6637
  });
6629
6638
  return {
6630
6639
  text: extractLastAssistantContent2(response.output),
@@ -6640,12 +6649,11 @@ ${outputSchema}`;
6640
6649
  });
6641
6650
 
6642
6651
  // src/evaluation/graders/composite.ts
6643
- var import_ai2, DEFAULT_COMPOSITE_AGGREGATOR_PROMPT, CompositeGrader;
6652
+ var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT, CompositeGrader;
6644
6653
  var init_composite = __esm({
6645
6654
  "src/evaluation/graders/composite.ts"() {
6646
6655
  "use strict";
6647
6656
  init_cjs_shims();
6648
- import_ai2 = require("ai");
6649
6657
  init_types2();
6650
6658
  init_code_grader();
6651
6659
  init_llm_grader();
@@ -6888,25 +6896,6 @@ Return a JSON object with: score (0.0-1.0), verdict (pass/fail), and reasoning.`
6888
6896
  target: graderProvider.targetName
6889
6897
  };
6890
6898
  try {
6891
- const model = graderProvider.asLanguageModel?.();
6892
- if (model) {
6893
- const { text } = await (0, import_ai2.generateText)({
6894
- model,
6895
- system: systemPrompt,
6896
- prompt: userPrompt
6897
- });
6898
- const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
6899
- const score2 = clampScore(data2.score);
6900
- const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
6901
- return {
6902
- score: score2,
6903
- verdict: scoreToVerdict(score2),
6904
- assertions: assertions2,
6905
- expectedAspectCount: Math.max(assertions2.length, 1),
6906
- graderRawRequest,
6907
- scores
6908
- };
6909
- }
6910
6899
  const response = await graderProvider.invoke({
6911
6900
  question: userPrompt,
6912
6901
  systemPrompt,
@@ -8689,115 +8678,254 @@ var init_graders2 = __esm({
8689
8678
  }
8690
8679
  });
8691
8680
 
8692
- // src/evaluation/providers/agentv-provider.ts
8693
- var agentv_provider_exports = {};
8694
- __export(agentv_provider_exports, {
8695
- AgentvProvider: () => AgentvProvider
8696
- });
8697
- function parseModelString(model) {
8698
- const colonIndex = model.indexOf(":");
8699
- if (colonIndex === -1) {
8700
- throw new Error(
8701
- `Invalid model string "${model}". Expected format "provider:model" (e.g., "openai:gpt-5-mini")`
8681
+ // src/evaluation/providers/llm-providers.ts
8682
+ function buildAzureBaseUrl(input) {
8683
+ const trimmed = input.replace(/\/+$/, "");
8684
+ if (trimmed.endsWith("/openai/v1")) return trimmed;
8685
+ if (trimmed.endsWith("/openai")) return `${trimmed}/v1`;
8686
+ return `${trimmed}/openai/v1`;
8687
+ }
8688
+ async function invokePiAi(options) {
8689
+ const { model, apiKey, request, defaults, retryConfig, providerOptions } = options;
8690
+ const tools = request.tools && request.tools.length > 0 ? request.tools : void 0;
8691
+ const maxSteps = tools ? Math.max(1, request.maxSteps ?? 1) : 1;
8692
+ const { systemPrompt, messages } = chatPromptToPiContext(buildChatPrompt(request));
8693
+ if (request.images && request.images.length > 0) {
8694
+ attachImagesToLastUserMessage(messages, request.images);
8695
+ }
8696
+ const piTools = tools ? tools.map((t) => ({
8697
+ name: t.name,
8698
+ description: t.description,
8699
+ parameters: t.parameters
8700
+ })) : void 0;
8701
+ const ctx = { systemPrompt, messages, ...piTools ? { tools: piTools } : {} };
8702
+ const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
8703
+ const callOptions = {
8704
+ ...apiKey !== void 0 ? { apiKey } : {},
8705
+ temperature,
8706
+ ...maxOutputTokens !== void 0 ? { maxTokens: maxOutputTokens } : {},
8707
+ signal: request.signal,
8708
+ ...providerOptions ?? {}
8709
+ };
8710
+ const startTime = (/* @__PURE__ */ new Date()).toISOString();
8711
+ const startMs = Date.now();
8712
+ const aggregateUsage = { input: 0, output: 0, cacheRead: 0, cost: 0 };
8713
+ let stepCount = 0;
8714
+ let toolCallCount = 0;
8715
+ let result = await withRetry(
8716
+ () => (0, import_pi_ai.complete)(model, ctx, callOptions),
8717
+ retryConfig,
8718
+ request.signal
8719
+ );
8720
+ ctx.messages.push(result);
8721
+ stepCount = 1;
8722
+ accumulateUsage(aggregateUsage, result.usage);
8723
+ while (tools) {
8724
+ const calls = result.content.filter(
8725
+ (b) => b.type === "toolCall"
8726
+ );
8727
+ if (calls.length === 0) break;
8728
+ if (stepCount >= maxSteps) break;
8729
+ toolCallCount += calls.length;
8730
+ for (const call of calls) {
8731
+ const tool = tools.find((t) => t.name === call.name);
8732
+ let output;
8733
+ let isError = false;
8734
+ try {
8735
+ if (!tool) {
8736
+ throw new Error(`pi-ai adapter: model called unknown tool '${call.name}'`);
8737
+ }
8738
+ output = await tool.execute(call.arguments);
8739
+ } catch (err) {
8740
+ output = err instanceof Error ? err.message : String(err);
8741
+ isError = true;
8742
+ }
8743
+ ctx.messages.push({
8744
+ role: "toolResult",
8745
+ toolCallId: call.id,
8746
+ toolName: call.name,
8747
+ content: [
8748
+ { type: "text", text: typeof output === "string" ? output : JSON.stringify(output) }
8749
+ ],
8750
+ isError,
8751
+ timestamp: Date.now()
8752
+ });
8753
+ }
8754
+ result = await withRetry(
8755
+ () => (0, import_pi_ai.complete)(model, ctx, callOptions),
8756
+ retryConfig,
8757
+ request.signal
8702
8758
  );
8759
+ ctx.messages.push(result);
8760
+ stepCount += 1;
8761
+ accumulateUsage(aggregateUsage, result.usage);
8703
8762
  }
8704
- return {
8705
- provider: model.slice(0, colonIndex),
8706
- modelName: model.slice(colonIndex + 1)
8707
- };
8763
+ const endTime = (/* @__PURE__ */ new Date()).toISOString();
8764
+ const durationMs = Date.now() - startMs;
8765
+ return mapPiResponse(result, {
8766
+ durationMs,
8767
+ startTime,
8768
+ endTime,
8769
+ aggregateUsage,
8770
+ steps: tools ? { count: stepCount, toolCallCount } : void 0
8771
+ });
8708
8772
  }
8709
- function createLanguageModel(modelString) {
8710
- const { provider, modelName } = parseModelString(modelString);
8711
- switch (provider) {
8712
- case "openai":
8713
- return (0, import_openai.createOpenAI)()(modelName);
8714
- case "anthropic":
8715
- return (0, import_anthropic.createAnthropic)()(modelName);
8716
- case "azure":
8717
- return (0, import_azure.createAzure)().chat(modelName);
8718
- case "google":
8719
- return (0, import_google.createGoogleGenerativeAI)()(modelName);
8720
- default:
8773
+ function accumulateUsage(agg, u) {
8774
+ agg.input += u.input;
8775
+ agg.output += u.output;
8776
+ agg.cacheRead += u.cacheRead;
8777
+ agg.cost += u.cost.total;
8778
+ }
8779
+ function resolvePiModel(args) {
8780
+ const { providerName, apiId, modelId, baseUrl } = args;
8781
+ let model;
8782
+ try {
8783
+ model = (0, import_pi_ai.getModel)(providerName, modelId);
8784
+ } catch {
8785
+ model = void 0;
8786
+ }
8787
+ if (!model) {
8788
+ const fallbackBaseUrl = baseUrl ?? defaultBaseUrlFor(providerName);
8789
+ if (!fallbackBaseUrl) {
8721
8790
  throw new Error(
8722
- `Unsupported AI SDK provider "${provider}" in model string "${modelString}". Supported providers: openai, anthropic, azure, google`
8791
+ `pi-ai adapter cannot resolve a baseUrl for provider '${providerName}' / model '${modelId}'. Either set the target's baseUrl/endpoint or use a model id pi-ai recognizes.`
8723
8792
  );
8724
- }
8725
- }
8726
- var import_anthropic, import_azure, import_google, import_openai, AgentvProvider;
8727
- var init_agentv_provider = __esm({
8728
- "src/evaluation/providers/agentv-provider.ts"() {
8729
- "use strict";
8730
- init_cjs_shims();
8731
- import_anthropic = require("@ai-sdk/anthropic");
8732
- import_azure = require("@ai-sdk/azure");
8733
- import_google = require("@ai-sdk/google");
8734
- import_openai = require("@ai-sdk/openai");
8735
- AgentvProvider = class {
8736
- id;
8737
- kind = "agentv";
8738
- targetName;
8739
- model;
8740
- constructor(targetName, config) {
8741
- this.id = `agentv:${targetName}`;
8742
- this.targetName = targetName;
8743
- this.model = createLanguageModel(config.model);
8744
- }
8745
- /**
8746
- * Direct invoke is not supported for the agentv provider.
8747
- * Use asLanguageModel() with generateText() instead.
8748
- */
8749
- async invoke(_request) {
8750
- throw new Error(
8751
- "AgentvProvider does not support direct invoke(). Use asLanguageModel() with generateText() instead."
8752
- );
8753
- }
8754
- /**
8755
- * Returns the resolved AI SDK LanguageModel for use with generateText/generateObject.
8756
- */
8757
- asLanguageModel() {
8758
- return this.model;
8759
- }
8793
+ }
8794
+ model = {
8795
+ id: modelId,
8796
+ name: modelId,
8797
+ api: apiId,
8798
+ provider: providerName,
8799
+ baseUrl: fallbackBaseUrl,
8800
+ reasoning: false,
8801
+ input: ["text"],
8802
+ cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
8803
+ contextWindow: 128e3,
8804
+ maxTokens: 16384
8760
8805
  };
8761
8806
  }
8762
- });
8763
-
8764
- // src/evaluation/providers/ai-sdk.ts
8765
- function buildAzureOptions(config) {
8766
- const options = {
8767
- apiKey: config.apiKey,
8768
- apiVersion: config.version,
8769
- // Chat completions still use deployment-scoped Azure URLs for compatibility
8770
- // with existing deployments. Responses API should use the SDK's v1 path.
8771
- useDeploymentBasedUrls: config.apiFormat !== "responses"
8772
- };
8773
- const baseURL = normalizeAzureBaseUrl(config.resourceName);
8774
- if (baseURL) {
8775
- options.baseURL = baseURL;
8776
- } else {
8777
- options.resourceName = config.resourceName;
8807
+ if (model.api !== apiId) {
8808
+ model = { ...model, api: apiId };
8809
+ }
8810
+ if (baseUrl) {
8811
+ model = { ...model, baseUrl };
8778
8812
  }
8779
- return options;
8813
+ return model;
8780
8814
  }
8781
- function normalizeAzureBaseUrl(resourceName) {
8782
- const trimmed = resourceName.trim();
8783
- if (!/^https?:\/\//i.test(trimmed)) {
8784
- return void 0;
8815
+ function defaultBaseUrlFor(providerName) {
8816
+ if (providerName === "openai") return "https://api.openai.com/v1";
8817
+ if (providerName === "openrouter") return "https://openrouter.ai/api/v1";
8818
+ return void 0;
8819
+ }
8820
+ function chatPromptToPiContext(chatPrompt) {
8821
+ const systemSegments = [];
8822
+ const messages = [];
8823
+ const now = Date.now();
8824
+ for (const message of chatPrompt) {
8825
+ if (message.role === "system") {
8826
+ systemSegments.push(message.content);
8827
+ continue;
8828
+ }
8829
+ if (message.role === "user") {
8830
+ messages.push({ role: "user", content: message.content, timestamp: now });
8831
+ continue;
8832
+ }
8833
+ if (message.role === "assistant") {
8834
+ messages.push({
8835
+ role: "assistant",
8836
+ content: [{ type: "text", text: message.content }],
8837
+ api: "",
8838
+ provider: "",
8839
+ model: "",
8840
+ usage: {
8841
+ input: 0,
8842
+ output: 0,
8843
+ cacheRead: 0,
8844
+ cacheWrite: 0,
8845
+ totalTokens: 0,
8846
+ cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }
8847
+ },
8848
+ stopReason: "stop",
8849
+ timestamp: now
8850
+ });
8851
+ continue;
8852
+ }
8853
+ if (message.role === "tool" || message.role === "function") {
8854
+ const prefix = message.name ? `@[${message.name}]: ` : "@[Tool]: ";
8855
+ messages.push({
8856
+ role: "assistant",
8857
+ content: [{ type: "text", text: `${prefix}${message.content}` }],
8858
+ api: "",
8859
+ provider: "",
8860
+ model: "",
8861
+ usage: {
8862
+ input: 0,
8863
+ output: 0,
8864
+ cacheRead: 0,
8865
+ cacheWrite: 0,
8866
+ totalTokens: 0,
8867
+ cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }
8868
+ },
8869
+ stopReason: "stop",
8870
+ timestamp: now
8871
+ });
8872
+ continue;
8873
+ }
8874
+ throw new Error(`pi-ai adapter received unsupported message role '${message.role}'.`);
8785
8875
  }
8786
- const withoutSlash = trimmed.replace(/\/+$/, "");
8787
- const normalized = withoutSlash.endsWith("/openai") ? withoutSlash : `${withoutSlash}/openai`;
8788
- return normalized;
8876
+ return {
8877
+ systemPrompt: systemSegments.length > 0 ? systemSegments.join("\n\n") : void 0,
8878
+ messages
8879
+ };
8789
8880
  }
8790
- function buildAnthropicProviderOptions(defaults) {
8791
- if (defaults.thinkingBudget === void 0) {
8792
- return void 0;
8881
+ function attachImagesToLastUserMessage(messages, images) {
8882
+ if (!images || images.length === 0) return;
8883
+ for (let i = messages.length - 1; i >= 0; i--) {
8884
+ const m = messages[i];
8885
+ if (m.role !== "user") continue;
8886
+ const text = typeof m.content === "string" ? m.content : "";
8887
+ messages[i] = {
8888
+ ...m,
8889
+ content: [
8890
+ ...text ? [{ type: "text", text }] : [],
8891
+ ...images.map((img) => ({
8892
+ type: "image",
8893
+ data: img.source,
8894
+ mimeType: img.media_type
8895
+ }))
8896
+ ]
8897
+ };
8898
+ return;
8793
8899
  }
8900
+ messages.push({
8901
+ role: "user",
8902
+ content: images.map((img) => ({
8903
+ type: "image",
8904
+ data: img.source,
8905
+ mimeType: img.media_type
8906
+ })),
8907
+ timestamp: Date.now()
8908
+ });
8909
+ }
8910
+ function mapPiResponse(result, timing) {
8911
+ const text = result.content.filter((b) => b.type === "text").map((b) => b.text).join("");
8912
+ const cached = timing.aggregateUsage.cacheRead > 0 ? timing.aggregateUsage.cacheRead : void 0;
8913
+ const tokenUsage = {
8914
+ input: timing.aggregateUsage.input,
8915
+ output: timing.aggregateUsage.output,
8916
+ ...cached !== void 0 ? { cached } : {}
8917
+ };
8918
+ const costUsd = timing.aggregateUsage.cost > 0 ? timing.aggregateUsage.cost : void 0;
8794
8919
  return {
8795
- anthropic: {
8796
- thinking: {
8797
- type: "enabled",
8798
- budgetTokens: defaults.thinkingBudget
8799
- }
8800
- }
8920
+ raw: result,
8921
+ usage: toJsonObject(result.usage),
8922
+ output: [{ role: "assistant", content: text }],
8923
+ tokenUsage,
8924
+ ...costUsd !== void 0 ? { costUsd } : {},
8925
+ durationMs: timing.durationMs,
8926
+ startTime: timing.startTime,
8927
+ endTime: timing.endTime,
8928
+ ...timing.steps ? { steps: timing.steps } : {}
8801
8929
  };
8802
8930
  }
8803
8931
  function buildChatPrompt(request) {
@@ -8812,92 +8940,21 @@ function buildChatPrompt(request) {
8812
8940
  }
8813
8941
  const systemContent = resolveSystemContent(request);
8814
8942
  const userContent = request.question.trim();
8815
- const prompt = [
8943
+ return [
8816
8944
  { role: "system", content: systemContent },
8817
8945
  { role: "user", content: userContent }
8818
8946
  ];
8819
- return prompt;
8820
8947
  }
8821
8948
  function resolveSystemContent(request) {
8822
- const systemSegments = [];
8823
8949
  if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
8824
- systemSegments.push(request.systemPrompt.trim());
8825
- } else {
8826
- systemSegments.push(DEFAULT_SYSTEM_PROMPT);
8950
+ return request.systemPrompt.trim();
8827
8951
  }
8828
- return systemSegments.join("\n\n");
8829
- }
8830
- function toModelMessages(chatPrompt) {
8831
- return chatPrompt.map((message) => {
8832
- if (message.role === "tool" || message.role === "function") {
8833
- const prefix = message.name ? `@[${message.name}]: ` : "@[Tool]: ";
8834
- return {
8835
- role: "assistant",
8836
- content: `${prefix}${message.content}`
8837
- };
8838
- }
8839
- if (message.role === "assistant" || message.role === "system" || message.role === "user") {
8840
- return {
8841
- role: message.role,
8842
- content: message.content
8843
- };
8844
- }
8845
- return {
8846
- role: "user",
8847
- content: message.content
8848
- };
8849
- });
8952
+ return DEFAULT_SYSTEM_PROMPT;
8850
8953
  }
8851
8954
  function resolveModelSettings(request, defaults) {
8852
- const temperature = request.temperature ?? defaults.temperature;
8853
- const maxOutputTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
8854
8955
  return {
8855
- temperature,
8856
- maxOutputTokens
8857
- };
8858
- }
8859
- async function invokeModel(options) {
8860
- const { model, request, defaults, retryConfig, providerOptions } = options;
8861
- const chatPrompt = buildChatPrompt(request);
8862
- const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
8863
- const startTime = (/* @__PURE__ */ new Date()).toISOString();
8864
- const startMs = Date.now();
8865
- const result = await withRetry(
8866
- () => (0, import_ai3.generateText)({
8867
- model,
8868
- messages: toModelMessages(chatPrompt),
8869
- temperature,
8870
- maxOutputTokens,
8871
- maxRetries: 0,
8872
- abortSignal: request.signal,
8873
- ...providerOptions ? { providerOptions } : {}
8874
- }),
8875
- retryConfig,
8876
- request.signal
8877
- );
8878
- const endTime = (/* @__PURE__ */ new Date()).toISOString();
8879
- const durationMs = Date.now() - startMs;
8880
- return mapResponse(result, { durationMs, startTime, endTime });
8881
- }
8882
- function mapResponse(result, timing) {
8883
- const content = result.text ?? "";
8884
- const rawUsage = result.totalUsage ?? result.usage;
8885
- const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
8886
- const cached = rawUsage?.inputTokenDetails?.cacheReadTokens ?? void 0;
8887
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? {
8888
- input: rawUsage.inputTokens,
8889
- output: rawUsage.outputTokens,
8890
- ...reasoning != null ? { reasoning } : {},
8891
- ...cached != null ? { cached } : {}
8892
- } : void 0;
8893
- return {
8894
- raw: result,
8895
- usage: toJsonObject(rawUsage),
8896
- output: [{ role: "assistant", content }],
8897
- tokenUsage,
8898
- durationMs: timing?.durationMs,
8899
- startTime: timing?.startTime,
8900
- endTime: timing?.endTime
8956
+ temperature: request.temperature ?? defaults.temperature,
8957
+ maxOutputTokens: request.maxOutputTokens ?? defaults.maxOutputTokens
8901
8958
  };
8902
8959
  }
8903
8960
  function toJsonObject(value) {
@@ -8911,9 +8968,7 @@ function toJsonObject(value) {
8911
8968
  }
8912
8969
  }
8913
8970
  function extractStatus(error) {
8914
- if (!error || typeof error !== "object") {
8915
- return void 0;
8916
- }
8971
+ if (!error || typeof error !== "object") return void 0;
8917
8972
  const candidate = error;
8918
8973
  const directStatus = candidate.status ?? candidate.statusCode;
8919
8974
  if (typeof directStatus === "number" && Number.isFinite(directStatus)) {
@@ -8928,21 +8983,15 @@ function extractStatus(error) {
8928
8983
  const match = message.match(/HTTP\s+(\d{3})/i);
8929
8984
  if (match) {
8930
8985
  const parsed = Number.parseInt(match[1], 10);
8931
- if (Number.isFinite(parsed)) {
8932
- return parsed;
8933
- }
8986
+ if (Number.isFinite(parsed)) return parsed;
8934
8987
  }
8935
8988
  }
8936
8989
  return void 0;
8937
8990
  }
8938
8991
  function isNetworkError(error) {
8939
- if (!error || typeof error !== "object") {
8940
- return false;
8941
- }
8992
+ if (!error || typeof error !== "object") return false;
8942
8993
  const candidate = error;
8943
- if (candidate.name === "AbortError") {
8944
- return false;
8945
- }
8994
+ if (candidate.name === "AbortError") return false;
8946
8995
  const code = candidate.code;
8947
8996
  if (typeof code === "string" && /^E(AI|CONN|HOST|NET|PIPE|TIME|REFUSED|RESET)/i.test(code)) {
8948
8997
  return true;
@@ -8955,12 +9004,8 @@ function isNetworkError(error) {
8955
9004
  }
8956
9005
  function isRetryableError(error, retryableStatusCodes) {
8957
9006
  const status = extractStatus(error);
8958
- if (status === 401 || status === 403) {
8959
- return false;
8960
- }
8961
- if (typeof status === "number") {
8962
- return retryableStatusCodes.includes(status);
8963
- }
9007
+ if (status === 401 || status === 403) return false;
9008
+ if (typeof status === "number") return retryableStatusCodes.includes(status);
8964
9009
  return isNetworkError(error);
8965
9010
  }
8966
9011
  function calculateRetryDelay(attempt, config) {
@@ -8990,195 +9035,266 @@ async function withRetry(fn, retryConfig, signal) {
8990
9035
  return await fn();
8991
9036
  } catch (error) {
8992
9037
  lastError = error;
8993
- if (attempt >= config.maxRetries) {
8994
- break;
8995
- }
8996
- if (!isRetryableError(error, config.retryableStatusCodes)) {
8997
- throw error;
8998
- }
9038
+ if (attempt >= config.maxRetries) break;
9039
+ if (!isRetryableError(error, config.retryableStatusCodes)) throw error;
8999
9040
  const delay = calculateRetryDelay(attempt, config);
9000
9041
  await sleep(delay);
9001
9042
  }
9002
9043
  }
9003
9044
  throw lastError;
9004
9045
  }
9005
- var import_anthropic2, import_azure2, import_google2, import_openai2, import_ai_sdk_provider, import_ai3, DEFAULT_SYSTEM_PROMPT, OpenAIProvider, AzureProvider, OpenRouterProvider, AnthropicProvider, GeminiProvider;
9006
- var init_ai_sdk = __esm({
9007
- "src/evaluation/providers/ai-sdk.ts"() {
9046
+ var import_pi_ai, DEFAULT_SYSTEM_PROMPT, OpenAIProvider, OpenRouterProvider, AnthropicProvider, GeminiProvider, AzureProvider;
9047
+ var init_llm_providers = __esm({
9048
+ "src/evaluation/providers/llm-providers.ts"() {
9008
9049
  "use strict";
9009
9050
  init_cjs_shims();
9010
- import_anthropic2 = require("@ai-sdk/anthropic");
9011
- import_azure2 = require("@ai-sdk/azure");
9012
- import_google2 = require("@ai-sdk/google");
9013
- import_openai2 = require("@ai-sdk/openai");
9014
- import_ai_sdk_provider = require("@openrouter/ai-sdk-provider");
9015
- import_ai3 = require("ai");
9051
+ import_pi_ai = require("@mariozechner/pi-ai");
9052
+ (0, import_pi_ai.registerBuiltInApiProviders)();
9016
9053
  DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
9017
9054
  OpenAIProvider = class {
9018
- constructor(targetName, config) {
9019
- this.config = config;
9020
- this.id = `openai:${targetName}`;
9021
- this.targetName = targetName;
9022
- this.defaults = {
9023
- temperature: config.temperature,
9024
- maxOutputTokens: config.maxOutputTokens
9025
- };
9026
- this.retryConfig = config.retry;
9027
- const openai = (0, import_openai2.createOpenAI)({
9028
- apiKey: config.apiKey,
9029
- baseURL: config.baseURL
9030
- });
9031
- this.model = config.apiFormat === "responses" ? openai(config.model) : openai.chat(config.model);
9032
- }
9033
9055
  id;
9034
9056
  kind = "openai";
9035
9057
  targetName;
9036
- model;
9058
+ piModel;
9037
9059
  defaults;
9038
9060
  retryConfig;
9039
- async invoke(request) {
9040
- return invokeModel({
9041
- model: this.model,
9042
- request,
9043
- defaults: this.defaults,
9044
- retryConfig: this.retryConfig
9045
- });
9046
- }
9047
- asLanguageModel() {
9048
- return this.model;
9049
- }
9050
- };
9051
- AzureProvider = class {
9061
+ apiKey;
9052
9062
  constructor(targetName, config) {
9053
- this.config = config;
9054
- this.id = `azure:${targetName}`;
9063
+ this.id = `openai:${targetName}`;
9055
9064
  this.targetName = targetName;
9065
+ this.apiKey = config.apiKey;
9056
9066
  this.defaults = {
9057
9067
  temperature: config.temperature,
9058
9068
  maxOutputTokens: config.maxOutputTokens
9059
9069
  };
9060
9070
  this.retryConfig = config.retry;
9061
- const azure = (0, import_azure2.createAzure)(buildAzureOptions(config));
9062
- this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
9071
+ this.piModel = resolvePiModel({
9072
+ providerName: "openai",
9073
+ apiId: config.apiFormat === "responses" ? "openai-responses" : "openai-completions",
9074
+ modelId: config.model,
9075
+ baseUrl: config.baseURL
9076
+ });
9063
9077
  }
9064
- id;
9065
- kind = "azure";
9066
- targetName;
9067
- model;
9068
- defaults;
9069
- retryConfig;
9070
9078
  async invoke(request) {
9071
- return invokeModel({
9072
- model: this.model,
9079
+ return invokePiAi({
9080
+ model: this.piModel,
9081
+ apiKey: this.apiKey,
9073
9082
  request,
9074
9083
  defaults: this.defaults,
9075
9084
  retryConfig: this.retryConfig
9076
9085
  });
9077
9086
  }
9078
- asLanguageModel() {
9079
- return this.model;
9080
- }
9081
9087
  };
9082
9088
  OpenRouterProvider = class {
9089
+ id;
9090
+ kind = "openrouter";
9091
+ targetName;
9092
+ piModel;
9093
+ defaults;
9094
+ retryConfig;
9095
+ apiKey;
9083
9096
  constructor(targetName, config) {
9084
- this.config = config;
9085
9097
  this.id = `openrouter:${targetName}`;
9086
9098
  this.targetName = targetName;
9099
+ this.apiKey = config.apiKey;
9087
9100
  this.defaults = {
9088
9101
  temperature: config.temperature,
9089
9102
  maxOutputTokens: config.maxOutputTokens
9090
9103
  };
9091
9104
  this.retryConfig = config.retry;
9092
- const openrouter = (0, import_ai_sdk_provider.createOpenRouter)({
9093
- apiKey: config.apiKey
9105
+ this.piModel = resolvePiModel({
9106
+ providerName: "openrouter",
9107
+ apiId: "openai-completions",
9108
+ modelId: config.model,
9109
+ baseUrl: "https://openrouter.ai/api/v1"
9094
9110
  });
9095
- this.model = openrouter(config.model);
9096
9111
  }
9097
- id;
9098
- kind = "openrouter";
9099
- targetName;
9100
- model;
9101
- defaults;
9102
- retryConfig;
9103
9112
  async invoke(request) {
9104
- return invokeModel({
9105
- model: this.model,
9113
+ return invokePiAi({
9114
+ model: this.piModel,
9115
+ apiKey: this.apiKey,
9106
9116
  request,
9107
9117
  defaults: this.defaults,
9108
9118
  retryConfig: this.retryConfig
9109
9119
  });
9110
9120
  }
9111
- asLanguageModel() {
9112
- return this.model;
9113
- }
9114
9121
  };
9115
9122
  AnthropicProvider = class {
9123
+ id;
9124
+ kind = "anthropic";
9125
+ targetName;
9126
+ piModel;
9127
+ defaults;
9128
+ retryConfig;
9129
+ apiKey;
9130
+ thinkingBudget;
9116
9131
  constructor(targetName, config) {
9117
- this.config = config;
9118
9132
  this.id = `anthropic:${targetName}`;
9119
9133
  this.targetName = targetName;
9134
+ this.apiKey = config.apiKey;
9135
+ this.thinkingBudget = config.thinkingBudget;
9120
9136
  this.defaults = {
9121
9137
  temperature: config.temperature,
9122
9138
  maxOutputTokens: config.maxOutputTokens,
9123
9139
  thinkingBudget: config.thinkingBudget
9124
9140
  };
9125
9141
  this.retryConfig = config.retry;
9126
- const anthropic = (0, import_anthropic2.createAnthropic)({
9127
- apiKey: config.apiKey
9142
+ this.piModel = resolvePiModel({
9143
+ providerName: "anthropic",
9144
+ apiId: "anthropic-messages",
9145
+ modelId: config.model
9128
9146
  });
9129
- this.model = anthropic(config.model);
9130
9147
  }
9131
- id;
9132
- kind = "anthropic";
9133
- targetName;
9134
- model;
9135
- defaults;
9136
- retryConfig;
9137
9148
  async invoke(request) {
9138
- const providerOptions = buildAnthropicProviderOptions(this.defaults);
9139
- return invokeModel({
9140
- model: this.model,
9149
+ const providerOptions = this.thinkingBudget !== void 0 ? { thinkingEnabled: true, thinkingBudgetTokens: this.thinkingBudget } : void 0;
9150
+ return invokePiAi({
9151
+ model: this.piModel,
9152
+ apiKey: this.apiKey,
9141
9153
  request,
9142
9154
  defaults: this.defaults,
9143
9155
  retryConfig: this.retryConfig,
9144
- providerOptions
9156
+ ...providerOptions ? { providerOptions } : {}
9145
9157
  });
9146
9158
  }
9147
- asLanguageModel() {
9148
- return this.model;
9149
- }
9150
9159
  };
9151
9160
  GeminiProvider = class {
9161
+ id;
9162
+ kind = "gemini";
9163
+ targetName;
9164
+ piModel;
9165
+ defaults;
9166
+ retryConfig;
9167
+ apiKey;
9152
9168
  constructor(targetName, config) {
9153
- this.config = config;
9154
9169
  this.id = `gemini:${targetName}`;
9155
9170
  this.targetName = targetName;
9171
+ this.apiKey = config.apiKey;
9156
9172
  this.defaults = {
9157
9173
  temperature: config.temperature,
9158
9174
  maxOutputTokens: config.maxOutputTokens
9159
9175
  };
9160
9176
  this.retryConfig = config.retry;
9161
- const google = (0, import_google2.createGoogleGenerativeAI)({
9162
- apiKey: config.apiKey
9177
+ this.piModel = resolvePiModel({
9178
+ providerName: "google",
9179
+ apiId: "google-generative-ai",
9180
+ modelId: config.model
9181
+ });
9182
+ }
9183
+ async invoke(request) {
9184
+ return invokePiAi({
9185
+ model: this.piModel,
9186
+ apiKey: this.apiKey,
9187
+ request,
9188
+ defaults: this.defaults,
9189
+ retryConfig: this.retryConfig
9163
9190
  });
9164
- this.model = google(config.model);
9165
9191
  }
9192
+ };
9193
+ AzureProvider = class {
9166
9194
  id;
9167
- kind = "gemini";
9195
+ kind = "azure";
9168
9196
  targetName;
9169
- model;
9197
+ piModel;
9170
9198
  defaults;
9171
9199
  retryConfig;
9200
+ apiKey;
9201
+ providerOptions;
9202
+ constructor(targetName, config) {
9203
+ this.id = `azure:${targetName}`;
9204
+ this.targetName = targetName;
9205
+ this.apiKey = config.apiKey;
9206
+ this.defaults = {
9207
+ temperature: config.temperature,
9208
+ maxOutputTokens: config.maxOutputTokens
9209
+ };
9210
+ this.retryConfig = config.retry;
9211
+ const trimmed = config.resourceName.trim();
9212
+ const isFullUrl = /^https?:\/\//i.test(trimmed);
9213
+ const baseUrl = isFullUrl ? buildAzureBaseUrl(trimmed) : void 0;
9214
+ this.providerOptions = {
9215
+ ...baseUrl ? { azureBaseUrl: baseUrl } : { azureResourceName: trimmed },
9216
+ ...config.version ? { azureApiVersion: config.version } : {}
9217
+ };
9218
+ this.piModel = resolvePiModel({
9219
+ providerName: "azure-openai-responses",
9220
+ apiId: "azure-openai-responses",
9221
+ // The "model id" for Azure is the deployment name.
9222
+ modelId: config.deploymentName,
9223
+ ...baseUrl ? { baseUrl } : {}
9224
+ });
9225
+ }
9172
9226
  async invoke(request) {
9173
- return invokeModel({
9174
- model: this.model,
9227
+ return invokePiAi({
9228
+ model: this.piModel,
9229
+ apiKey: this.apiKey,
9175
9230
  request,
9176
9231
  defaults: this.defaults,
9177
- retryConfig: this.retryConfig
9232
+ retryConfig: this.retryConfig,
9233
+ providerOptions: this.providerOptions
9178
9234
  });
9179
9235
  }
9180
- asLanguageModel() {
9181
- return this.model;
9236
+ };
9237
+ }
9238
+ });
9239
+
9240
+ // src/evaluation/providers/agentv-provider.ts
9241
+ var agentv_provider_exports = {};
9242
+ __export(agentv_provider_exports, {
9243
+ AgentvProvider: () => AgentvProvider
9244
+ });
9245
+ function parseAgentvModel(model) {
9246
+ const colonIndex = model.indexOf(":");
9247
+ if (colonIndex === -1) {
9248
+ throw new Error(
9249
+ `Invalid agentv model "${model}". Expected "provider:model" (e.g., "openai:gpt-5-mini").`
9250
+ );
9251
+ }
9252
+ const provider = model.slice(0, colonIndex);
9253
+ const modelId = model.slice(colonIndex + 1);
9254
+ switch (provider) {
9255
+ case "openai":
9256
+ return { providerName: "openai", apiId: "openai-completions", modelId };
9257
+ case "anthropic":
9258
+ return { providerName: "anthropic", apiId: "anthropic-messages", modelId };
9259
+ case "azure":
9260
+ return {
9261
+ providerName: "azure-openai-responses",
9262
+ apiId: "azure-openai-responses",
9263
+ modelId
9264
+ };
9265
+ case "google":
9266
+ return { providerName: "google", apiId: "google-generative-ai", modelId };
9267
+ default:
9268
+ throw new Error(
9269
+ `Unsupported agentv provider "${provider}" in "${model}". Supported: openai, anthropic, azure, google.`
9270
+ );
9271
+ }
9272
+ }
9273
+ var AgentvProvider;
9274
+ var init_agentv_provider = __esm({
9275
+ "src/evaluation/providers/agentv-provider.ts"() {
9276
+ "use strict";
9277
+ init_cjs_shims();
9278
+ init_llm_providers();
9279
+ AgentvProvider = class {
9280
+ id;
9281
+ kind = "agentv";
9282
+ targetName;
9283
+ piModel;
9284
+ defaults;
9285
+ constructor(targetName, config) {
9286
+ this.id = `agentv:${targetName}`;
9287
+ this.targetName = targetName;
9288
+ const { providerName, apiId, modelId } = parseAgentvModel(config.model);
9289
+ this.piModel = resolvePiModel({ providerName, apiId, modelId });
9290
+ this.defaults = { temperature: config.temperature };
9291
+ }
9292
+ async invoke(request) {
9293
+ return invokePiAi({
9294
+ model: this.piModel,
9295
+ request,
9296
+ defaults: this.defaults
9297
+ });
9182
9298
  }
9183
9299
  };
9184
9300
  }
@@ -13381,10 +13497,10 @@ function extractToolCallsFromEvents(events) {
13381
13497
  }
13382
13498
  }
13383
13499
  const toolCalls = [];
13384
- for (const [id, { tool: tool2, input }] of starts) {
13500
+ for (const [id, { tool, input }] of starts) {
13385
13501
  toolCalls.push(
13386
13502
  normalizeToolCall("pi-cli", {
13387
- tool: tool2,
13503
+ tool,
13388
13504
  input,
13389
13505
  id: id.startsWith("anon-") ? void 0 : id,
13390
13506
  output: results.get(id)
@@ -17765,7 +17881,6 @@ var init_providers = __esm({
17765
17881
  "use strict";
17766
17882
  init_cjs_shims();
17767
17883
  init_agentv_provider();
17768
- init_ai_sdk();
17769
17884
  init_claude_cli();
17770
17885
  init_claude_sdk();
17771
17886
  init_cli();
@@ -17773,6 +17888,7 @@ var init_providers = __esm({
17773
17888
  init_copilot_cli();
17774
17889
  init_copilot_log();
17775
17890
  init_copilot_sdk();
17891
+ init_llm_providers();
17776
17892
  init_mock();
17777
17893
  init_pi_cli();
17778
17894
  init_pi_coding_agent();
@@ -19799,6 +19915,19 @@ async function runEvaluation(options) {
19799
19915
  await dockerSetup.pullImage();
19800
19916
  setupLog("Docker image pull complete");
19801
19917
  }
19918
+ if (suiteWorkspace?.env) {
19919
+ try {
19920
+ await runPreflightChecks(suiteWorkspace.env, sharedWorkspacePath ?? void 0, setupLog);
19921
+ setupLog("preflight checks passed");
19922
+ } catch (error) {
19923
+ const message = error instanceof Error ? error.message : String(error);
19924
+ if (sharedWorkspacePath && !useStaticWorkspace) {
19925
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
19926
+ });
19927
+ }
19928
+ throw new Error(message);
19929
+ }
19930
+ }
19802
19931
  const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
19803
19932
  const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
19804
19933
  if (sharedWorkspacePath && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) {
@@ -22069,6 +22198,38 @@ function computeWeightedMean(entries) {
22069
22198
  }
22070
22199
  return totalWeight > 0 ? weightedSum / totalWeight : 0;
22071
22200
  }
22201
+ async function runPreflightChecks(env, cwd, log) {
22202
+ const execFileAsync5 = (0, import_node_util7.promisify)(import_node_child_process11.execFile);
22203
+ const missing = [];
22204
+ for (const cmd of env.required_commands ?? []) {
22205
+ log(`preflight: checking command "${cmd}"`);
22206
+ try {
22207
+ if (process.platform === "win32") {
22208
+ await execFileAsync5("where", [cmd], { cwd });
22209
+ } else {
22210
+ await execFileAsync5("sh", ["-c", `command -v ${cmd}`], { cwd });
22211
+ }
22212
+ } catch {
22213
+ missing.push(`command: ${cmd}`);
22214
+ }
22215
+ }
22216
+ for (const mod of env.required_python_modules ?? []) {
22217
+ log(`preflight: checking Python module "${mod}"`);
22218
+ try {
22219
+ await execFileAsync5("python3", ["-c", `import ${mod}`], { cwd });
22220
+ } catch {
22221
+ missing.push(`python module: ${mod}`);
22222
+ }
22223
+ }
22224
+ if (missing.length > 0) {
22225
+ throw new Error(
22226
+ `Preflight checks failed \u2014 missing dependencies:
22227
+ ${missing.map((m) => ` \u2022 ${m}`).join("\n")}
22228
+
22229
+ Install the missing dependencies before running this eval.`
22230
+ );
22231
+ }
22232
+ }
22072
22233
  var import_node_child_process11, import_node_crypto11, import_node_fs16, import_promises35, import_node_path47, import_node_util7, import_micromatch2, execFileAsync3, WORKSPACE_GIT_TIMEOUT_MS;
22073
22234
  var init_orchestrator = __esm({
22074
22235
  "src/evaluation/orchestrator.ts"() {
@@ -22931,7 +23092,8 @@ function parseWorkspaceConfig(raw, evalFileDir) {
22931
23092
  const workspacePath = typeof obj.path === "string" ? obj.path : void 0;
22932
23093
  const mode = explicitMode ?? (workspacePath ? "static" : void 0);
22933
23094
  const docker = parseDockerWorkspaceConfig(obj.docker);
22934
- if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker)
23095
+ const env = parseWorkspaceEnvConfig(obj.env);
23096
+ if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker && !env)
22935
23097
  return void 0;
22936
23098
  return {
22937
23099
  ...template !== void 0 && { template },
@@ -22940,7 +23102,19 @@ function parseWorkspaceConfig(raw, evalFileDir) {
22940
23102
  ...hooks !== void 0 && { hooks },
22941
23103
  ...mode !== void 0 && { mode },
22942
23104
  ...workspacePath !== void 0 && { path: workspacePath },
22943
- ...docker !== void 0 && { docker }
23105
+ ...docker !== void 0 && { docker },
23106
+ ...env !== void 0 && { env }
23107
+ };
23108
+ }
23109
+ function parseWorkspaceEnvConfig(raw) {
23110
+ if (!isJsonObject(raw)) return void 0;
23111
+ const obj = raw;
23112
+ const required_commands = Array.isArray(obj.required_commands) ? obj.required_commands.filter((c) => typeof c === "string") : void 0;
23113
+ const required_python_modules = Array.isArray(obj.required_python_modules) ? obj.required_python_modules.filter((m) => typeof m === "string") : void 0;
23114
+ if (!required_commands?.length && !required_python_modules?.length) return void 0;
23115
+ return {
23116
+ ...required_commands?.length && { required_commands },
23117
+ ...required_python_modules?.length && { required_python_modules }
22944
23118
  };
22945
23119
  }
22946
23120
  function parseDockerWorkspaceConfig(raw) {
@@ -24966,8 +25140,8 @@ init_cjs_shims();
24966
25140
 
24967
25141
  // src/evaluation/generators/rubric-generator.ts
24968
25142
  init_cjs_shims();
24969
- var import_ai4 = require("ai");
24970
25143
  var import_zod6 = require("zod");
25144
+ init_types2();
24971
25145
  var rubricItemSchema = import_zod6.z.object({
24972
25146
  id: import_zod6.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
24973
25147
  outcome: import_zod6.z.string().describe("Concrete expected outcome for this rubric item"),
@@ -24980,10 +25154,6 @@ var rubricGenerationSchema = import_zod6.z.object({
24980
25154
  async function generateRubrics(options) {
24981
25155
  const { criteria, question, referenceAnswer, provider } = options;
24982
25156
  const prompt = buildPrompt(criteria, question, referenceAnswer);
24983
- const model = provider.asLanguageModel?.();
24984
- if (!model) {
24985
- throw new Error("Provider does not support language model interface");
24986
- }
24987
25157
  const system = `You are an expert at creating evaluation rubrics.
24988
25158
  You must return a valid JSON object matching this schema:
24989
25159
  {
@@ -25000,11 +25170,11 @@ You must return a valid JSON object matching this schema:
25000
25170
  let lastError;
25001
25171
  for (let attempt = 1; attempt <= 3; attempt++) {
25002
25172
  try {
25003
- const { text } = await (0, import_ai4.generateText)({
25004
- model,
25005
- system,
25006
- prompt
25173
+ const response = await provider.invoke({
25174
+ question: prompt,
25175
+ systemPrompt: system
25007
25176
  });
25177
+ const text = extractLastAssistantContent2(response.output);
25008
25178
  const cleaned = text.replace(/```json\n?|```/g, "").trim();
25009
25179
  result = rubricGenerationSchema.parse(JSON.parse(cleaned));
25010
25180
  break;