@agentv/core 4.25.1 → 4.25.3-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -223,7 +223,7 @@ function computeTraceSummary(messages) {
223
223
  function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
224
224
  if (summary.eventCount === 0) return void 0;
225
225
  const explorationCalls = explorationTools.reduce(
226
- (sum, tool2) => sum + (summary.toolCalls[tool2] ?? 0),
226
+ (sum, tool) => sum + (summary.toolCalls[tool] ?? 0),
227
227
  0
228
228
  );
229
229
  return explorationCalls / summary.eventCount;
@@ -5187,8 +5187,17 @@ async function materializeContentForGrader(messages, getWorkDir) {
5187
5187
  }
5188
5188
  return result;
5189
5189
  }
5190
+ async function runScriptRaw(scriptPath, input, agentTimeoutMs, cwd, env) {
5191
+ return typeof scriptPath === "string" ? execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
5192
+ }
5190
5193
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
5191
- const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
5194
+ const { stdout, stderr, exitCode } = await runScriptRaw(
5195
+ scriptPath,
5196
+ input,
5197
+ agentTimeoutMs,
5198
+ cwd,
5199
+ env
5200
+ );
5192
5201
  if (exitCode !== 0) {
5193
5202
  const trimmedErr = formatStderr(stderr);
5194
5203
  throw new Error(
@@ -5306,6 +5315,8 @@ var init_code_grader = __esm({
5306
5315
  const env = proxyEnv || workspaceEnv ? { ...proxyEnv, ...workspaceEnv } : void 0;
5307
5316
  try {
5308
5317
  let stdout;
5318
+ let exitCode = 0;
5319
+ let execStderr = "";
5309
5320
  if (context2.dockerConfig) {
5310
5321
  const { DockerWorkspaceProvider: DockerWorkspaceProvider2 } = await Promise.resolve().then(() => (init_docker_workspace(), docker_workspace_exports));
5311
5322
  const dockerProvider = new DockerWorkspaceProvider2(context2.dockerConfig);
@@ -5314,31 +5325,42 @@ var init_code_grader = __esm({
5314
5325
  stdin: inputPayload,
5315
5326
  repoCheckouts: getRepoCheckoutTargets(context2.evalCase.workspace?.repos)
5316
5327
  });
5317
- if (result.exitCode !== 0) {
5318
- const trimmedErr = result.stderr.trim();
5319
- throw new Error(
5320
- trimmedErr.length > 0 ? `Code evaluator exited with code ${result.exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${result.exitCode}`
5321
- );
5322
- }
5328
+ exitCode = result.exitCode;
5323
5329
  stdout = result.stdout.trim();
5330
+ execStderr = result.stderr;
5324
5331
  } else {
5325
- stdout = await executeScript(
5332
+ const result = await runScriptRaw(
5326
5333
  this.command,
5327
5334
  inputPayload,
5328
5335
  this.agentTimeoutMs,
5329
5336
  this.cwd,
5330
5337
  env
5331
5338
  );
5339
+ exitCode = result.exitCode;
5340
+ stdout = result.stdout.trim();
5341
+ execStderr = result.stderr;
5332
5342
  }
5333
- const parsed = parseJsonSafe(stdout);
5334
- const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
5335
- const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
5343
+ const looksLikeJson = stdout.startsWith("{") || stdout.startsWith("[");
5344
+ const hasStderr = execStderr.trim().length > 0;
5345
+ if (exitCode !== 0 && (looksLikeJson || hasStderr)) {
5346
+ const trimmedErr = formatStderr(execStderr);
5347
+ throw new Error(
5348
+ trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
5349
+ );
5350
+ }
5351
+ const rawParsed = parseJsonSafe(stdout);
5352
+ const parsed = rawParsed != null && typeof rawParsed === "object" && !Array.isArray(rawParsed) ? rawParsed : void 0;
5353
+ const passed = exitCode === 0;
5354
+ const assertions = parsed != null && Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
5336
5355
  (a) => typeof a === "object" && a !== null && typeof a.text === "string"
5337
5356
  ).map((a) => ({
5338
5357
  text: String(a.text),
5339
5358
  passed: Boolean(a.passed),
5340
5359
  ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
5341
- })) : [];
5360
+ })) : parsed == null ? [{ text: stdout.trim() || (passed ? "exit 0" : `exit ${exitCode}`), passed }] : [];
5361
+ const score = parsed != null ? clampScore(
5362
+ typeof parsed.score === "number" ? parsed.score : assertions.length > 0 ? assertions.filter((a) => a.passed).length / assertions.length : 0
5363
+ ) : passed ? 1 : 0;
5342
5364
  const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
5343
5365
  const proxyUsage = getProxyUsage?.();
5344
5366
  const graderRawRequest = {
@@ -5646,13 +5668,6 @@ function extractImageBlocks(messages) {
5646
5668
  }
5647
5669
  return images;
5648
5670
  }
5649
- function toAiSdkImageParts(images) {
5650
- return images.map((img) => ({
5651
- type: "image",
5652
- image: img.source,
5653
- mediaType: img.media_type || void 0
5654
- }));
5655
- }
5656
5671
  function resolveSandboxed(basePath, relativePath) {
5657
5672
  const resolved = import_node_path12.default.resolve(basePath, relativePath);
5658
5673
  if (!resolved.startsWith(basePath + import_node_path12.default.sep) && resolved !== basePath) {
@@ -5661,15 +5676,24 @@ function resolveSandboxed(basePath, relativePath) {
5661
5676
  return resolved;
5662
5677
  }
5663
5678
  function createFilesystemTools(workspacePath) {
5664
- return {
5665
- list_files: (0, import_ai.tool)({
5679
+ return [
5680
+ {
5681
+ name: "list_files",
5666
5682
  description: "List files and directories at a relative path within the workspace. Returns names only (single level, no recursion).",
5667
- inputSchema: import_zod2.z.object({
5668
- path: import_zod2.z.string().describe('Relative path within workspace (use "." for root)').default(".")
5669
- }),
5683
+ parameters: {
5684
+ type: "object",
5685
+ properties: {
5686
+ path: {
5687
+ type: "string",
5688
+ description: 'Relative path within workspace (use "." for root)',
5689
+ default: "."
5690
+ }
5691
+ }
5692
+ },
5670
5693
  execute: async (input) => {
5694
+ const args = input ?? {};
5671
5695
  try {
5672
- const resolved = resolveSandboxed(workspacePath, input.path);
5696
+ const resolved = resolveSandboxed(workspacePath, args.path ?? ".");
5673
5697
  const entries = await import_promises12.default.readdir(resolved, { withFileTypes: true });
5674
5698
  return entries.map((e) => ({
5675
5699
  name: e.name,
@@ -5679,18 +5703,25 @@ function createFilesystemTools(workspacePath) {
5679
5703
  return { error: error instanceof Error ? error.message : String(error) };
5680
5704
  }
5681
5705
  }
5682
- }),
5683
- read_file: (0, import_ai.tool)({
5706
+ },
5707
+ {
5708
+ name: "read_file",
5684
5709
  description: "Read the content of a file at a relative path within the workspace. Large files are truncated at 50KB.",
5685
- inputSchema: import_zod2.z.object({
5686
- path: import_zod2.z.string().describe("Relative path to file within workspace")
5687
- }),
5710
+ parameters: {
5711
+ type: "object",
5712
+ properties: {
5713
+ path: { type: "string", description: "Relative path to file within workspace" }
5714
+ },
5715
+ required: ["path"]
5716
+ },
5688
5717
  execute: async (input) => {
5718
+ const args = input ?? {};
5719
+ const relPath = args.path ?? "";
5689
5720
  try {
5690
- const resolved = resolveSandboxed(workspacePath, input.path);
5721
+ const resolved = resolveSandboxed(workspacePath, relPath);
5691
5722
  const stat14 = await import_promises12.default.stat(resolved);
5692
5723
  if (stat14.isDirectory()) {
5693
- return { error: `'${input.path}' is a directory, not a file` };
5724
+ return { error: `'${relPath}' is a directory, not a file` };
5694
5725
  }
5695
5726
  const buffer = Buffer.alloc(Math.min(stat14.size, MAX_FILE_SIZE));
5696
5727
  const fd = await import_promises12.default.open(resolved, "r");
@@ -5706,19 +5737,29 @@ function createFilesystemTools(workspacePath) {
5706
5737
  return { error: error instanceof Error ? error.message : String(error) };
5707
5738
  }
5708
5739
  }
5709
- }),
5710
- search_files: (0, import_ai.tool)({
5740
+ },
5741
+ {
5742
+ name: "search_files",
5711
5743
  description: "Search for a regex pattern across files in the workspace. Returns up to 20 matches. Skips binary files and node_modules/.git.",
5712
- inputSchema: import_zod2.z.object({
5713
- pattern: import_zod2.z.string().describe("Regex pattern to search for"),
5714
- path: import_zod2.z.string().describe('Relative path to search within (use "." for root)').default(".")
5715
- }),
5744
+ parameters: {
5745
+ type: "object",
5746
+ properties: {
5747
+ pattern: { type: "string", description: "Regex pattern to search for" },
5748
+ path: {
5749
+ type: "string",
5750
+ description: 'Relative path to search within (use "." for root)',
5751
+ default: "."
5752
+ }
5753
+ },
5754
+ required: ["pattern"]
5755
+ },
5716
5756
  execute: async (input) => {
5757
+ const args = input ?? {};
5717
5758
  try {
5718
- const resolved = resolveSandboxed(workspacePath, input.path);
5759
+ const resolved = resolveSandboxed(workspacePath, args.path ?? ".");
5719
5760
  let regex;
5720
5761
  try {
5721
- regex = new RegExp(input.pattern, "gi");
5762
+ regex = new RegExp(args.pattern ?? "", "gi");
5722
5763
  } catch (regexErr) {
5723
5764
  return {
5724
5765
  error: `Invalid regex pattern: ${regexErr instanceof Error ? regexErr.message : String(regexErr)}`
@@ -5731,8 +5772,8 @@ function createFilesystemTools(workspacePath) {
5731
5772
  return { error: error instanceof Error ? error.message : String(error) };
5732
5773
  }
5733
5774
  }
5734
- })
5735
- };
5775
+ }
5776
+ ];
5736
5777
  }
5737
5778
  async function searchDirectory(dirPath, workspacePath, regex, matches) {
5738
5779
  if (matches.length >= MAX_SEARCH_MATCHES) return;
@@ -5772,14 +5813,13 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
5772
5813
  }
5773
5814
  }
5774
5815
  }
5775
- var import_promises12, import_node_path12, import_ai, import_zod2, DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT, MAX_FILE_SIZE, MAX_SEARCH_MATCHES, SEARCH_SKIP_DIRS, BINARY_EXTENSIONS, DEFAULT_GRADER_TEMPLATE, freeformEvaluationSchema, rubricCheckResultSchema, rubricEvaluationSchema, scoreRangeCheckResultSchema, scoreRangeEvaluationSchema, LlmGrader, ANSI_YELLOW7, ANSI_RESET8, warnedTemplateStrings;
5816
+ var import_promises12, import_node_path12, import_zod2, DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT, MAX_FILE_SIZE, MAX_SEARCH_MATCHES, SEARCH_SKIP_DIRS, BINARY_EXTENSIONS, DEFAULT_GRADER_TEMPLATE, freeformEvaluationSchema, rubricCheckResultSchema, rubricEvaluationSchema, scoreRangeCheckResultSchema, scoreRangeEvaluationSchema, LlmGrader, ANSI_YELLOW7, ANSI_RESET8, warnedTemplateStrings;
5776
5817
  var init_llm_grader = __esm({
5777
5818
  "src/evaluation/graders/llm-grader.ts"() {
5778
5819
  "use strict";
5779
5820
  init_cjs_shims();
5780
5821
  import_promises12 = __toESM(require("fs/promises"), 1);
5781
5822
  import_node_path12 = __toESM(require("path"), 1);
5782
- import_ai = require("ai");
5783
5823
  import_zod2 = require("zod");
5784
5824
  init_content_preprocessor();
5785
5825
  init_content();
@@ -6095,18 +6135,15 @@ ${context2.toolCalls}`;
6095
6135
  }
6096
6136
  }
6097
6137
  // ---------------------------------------------------------------------------
6098
- // Built-in agent mode (agentv provider — AI SDK generateText with filesystem tools)
6138
+ // Built-in agent mode (agentv provider — provider.invoke() with filesystem tools)
6099
6139
  // ---------------------------------------------------------------------------
6100
6140
  /**
6101
- * Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools.
6141
+ * Built-in mode: drives the grader through provider.invoke() with the
6142
+ * sandboxed filesystem tools and a step budget. The pi-ai-backed agentv
6143
+ * provider runs the agent loop (tool call → tool execute → next model
6144
+ * turn) until the model stops requesting tools or maxSteps is hit.
6102
6145
  */
6103
6146
  async evaluateBuiltIn(context2, graderProvider) {
6104
- const model = graderProvider.asLanguageModel?.();
6105
- if (!model) {
6106
- throw new Error(
6107
- `Grader provider '${graderProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent mode`
6108
- );
6109
- }
6110
6147
  const workspacePath = context2.workspacePath;
6111
6148
  if (!workspacePath) {
6112
6149
  throw new Error(
@@ -6125,18 +6162,21 @@ ${context2.toolCalls}`;
6125
6162
  maxSteps: this.maxSteps
6126
6163
  };
6127
6164
  try {
6128
- const { text, steps } = await (0, import_ai.generateText)({
6129
- model,
6130
- system: systemPrompt,
6131
- prompt: userPrompt,
6165
+ const response = await graderProvider.invoke({
6166
+ question: userPrompt,
6167
+ systemPrompt,
6168
+ evalCaseId: context2.evalCase.id,
6169
+ attempt: context2.attempt,
6170
+ temperature: this.temperature ?? 0,
6132
6171
  tools: fsTools,
6133
- stopWhen: (0, import_ai.stepCountIs)(this.maxSteps),
6134
- temperature: this.temperature ?? 0
6172
+ maxSteps: this.maxSteps
6135
6173
  });
6136
- const toolCallCount = steps.reduce((count, step) => count + (step.toolCalls?.length ?? 0), 0);
6174
+ const text = extractLastAssistantContent2(response.output);
6175
+ const stepCount = response.steps?.count ?? 1;
6176
+ const toolCallCount = response.steps?.toolCallCount ?? 0;
6137
6177
  const details = {
6138
6178
  mode: "built-in",
6139
- steps: steps.length,
6179
+ steps: stepCount,
6140
6180
  tool_calls: toolCallCount
6141
6181
  };
6142
6182
  return this.parseAgentResult(
@@ -6588,43 +6628,14 @@ ${outputSchema}`;
6588
6628
  }
6589
6629
  async generateStructuredResponse(options) {
6590
6630
  const { context: context2, graderProvider, systemPrompt, userPrompt, images } = options;
6591
- const model = graderProvider.asLanguageModel?.();
6592
- if (model) {
6593
- const modelOptions = {
6594
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
6595
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
6596
- };
6597
- const hasImages = images && images.length > 0;
6598
- const result = hasImages ? await (0, import_ai.generateText)({
6599
- model,
6600
- system: systemPrompt,
6601
- messages: [
6602
- {
6603
- role: "user",
6604
- content: [
6605
- { type: "text", text: userPrompt },
6606
- ...toAiSdkImageParts(images)
6607
- ]
6608
- }
6609
- ],
6610
- ...modelOptions
6611
- }) : await (0, import_ai.generateText)({
6612
- model,
6613
- system: systemPrompt,
6614
- prompt: userPrompt,
6615
- ...modelOptions
6616
- });
6617
- const rawUsage = result.usage;
6618
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
6619
- return { text: result.text, tokenUsage };
6620
- }
6621
6631
  const response = await graderProvider.invoke({
6622
6632
  question: userPrompt,
6623
6633
  systemPrompt,
6624
6634
  evalCaseId: context2.evalCase.id,
6625
6635
  attempt: context2.attempt,
6626
6636
  maxOutputTokens: this.maxOutputTokens,
6627
- temperature: this.temperature
6637
+ temperature: this.temperature,
6638
+ ...images && images.length > 0 ? { images } : {}
6628
6639
  });
6629
6640
  return {
6630
6641
  text: extractLastAssistantContent2(response.output),
@@ -6640,12 +6651,11 @@ ${outputSchema}`;
6640
6651
  });
6641
6652
 
6642
6653
  // src/evaluation/graders/composite.ts
6643
- var import_ai2, DEFAULT_COMPOSITE_AGGREGATOR_PROMPT, CompositeGrader;
6654
+ var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT, CompositeGrader;
6644
6655
  var init_composite = __esm({
6645
6656
  "src/evaluation/graders/composite.ts"() {
6646
6657
  "use strict";
6647
6658
  init_cjs_shims();
6648
- import_ai2 = require("ai");
6649
6659
  init_types2();
6650
6660
  init_code_grader();
6651
6661
  init_llm_grader();
@@ -6888,25 +6898,6 @@ Return a JSON object with: score (0.0-1.0), verdict (pass/fail), and reasoning.`
6888
6898
  target: graderProvider.targetName
6889
6899
  };
6890
6900
  try {
6891
- const model = graderProvider.asLanguageModel?.();
6892
- if (model) {
6893
- const { text } = await (0, import_ai2.generateText)({
6894
- model,
6895
- system: systemPrompt,
6896
- prompt: userPrompt
6897
- });
6898
- const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
6899
- const score2 = clampScore(data2.score);
6900
- const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
6901
- return {
6902
- score: score2,
6903
- verdict: scoreToVerdict(score2),
6904
- assertions: assertions2,
6905
- expectedAspectCount: Math.max(assertions2.length, 1),
6906
- graderRawRequest,
6907
- scores
6908
- };
6909
- }
6910
6901
  const response = await graderProvider.invoke({
6911
6902
  question: userPrompt,
6912
6903
  systemPrompt,
@@ -8689,115 +8680,254 @@ var init_graders2 = __esm({
8689
8680
  }
8690
8681
  });
8691
8682
 
8692
- // src/evaluation/providers/agentv-provider.ts
8693
- var agentv_provider_exports = {};
8694
- __export(agentv_provider_exports, {
8695
- AgentvProvider: () => AgentvProvider
8696
- });
8697
- function parseModelString(model) {
8698
- const colonIndex = model.indexOf(":");
8699
- if (colonIndex === -1) {
8700
- throw new Error(
8701
- `Invalid model string "${model}". Expected format "provider:model" (e.g., "openai:gpt-5-mini")`
8683
+ // src/evaluation/providers/llm-providers.ts
8684
+ function buildAzureBaseUrl(input) {
8685
+ const trimmed = input.replace(/\/+$/, "");
8686
+ if (trimmed.endsWith("/openai/v1")) return trimmed;
8687
+ if (trimmed.endsWith("/openai")) return `${trimmed}/v1`;
8688
+ return `${trimmed}/openai/v1`;
8689
+ }
8690
+ async function invokePiAi(options) {
8691
+ const { model, apiKey, request, defaults, retryConfig, providerOptions } = options;
8692
+ const tools = request.tools && request.tools.length > 0 ? request.tools : void 0;
8693
+ const maxSteps = tools ? Math.max(1, request.maxSteps ?? 1) : 1;
8694
+ const { systemPrompt, messages } = chatPromptToPiContext(buildChatPrompt(request));
8695
+ if (request.images && request.images.length > 0) {
8696
+ attachImagesToLastUserMessage(messages, request.images);
8697
+ }
8698
+ const piTools = tools ? tools.map((t) => ({
8699
+ name: t.name,
8700
+ description: t.description,
8701
+ parameters: t.parameters
8702
+ })) : void 0;
8703
+ const ctx = { systemPrompt, messages, ...piTools ? { tools: piTools } : {} };
8704
+ const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
8705
+ const callOptions = {
8706
+ ...apiKey !== void 0 ? { apiKey } : {},
8707
+ temperature,
8708
+ ...maxOutputTokens !== void 0 ? { maxTokens: maxOutputTokens } : {},
8709
+ signal: request.signal,
8710
+ ...providerOptions ?? {}
8711
+ };
8712
+ const startTime = (/* @__PURE__ */ new Date()).toISOString();
8713
+ const startMs = Date.now();
8714
+ const aggregateUsage = { input: 0, output: 0, cacheRead: 0, cost: 0 };
8715
+ let stepCount = 0;
8716
+ let toolCallCount = 0;
8717
+ let result = await withRetry(
8718
+ () => (0, import_pi_ai.complete)(model, ctx, callOptions),
8719
+ retryConfig,
8720
+ request.signal
8721
+ );
8722
+ ctx.messages.push(result);
8723
+ stepCount = 1;
8724
+ accumulateUsage(aggregateUsage, result.usage);
8725
+ while (tools) {
8726
+ const calls = result.content.filter(
8727
+ (b) => b.type === "toolCall"
8728
+ );
8729
+ if (calls.length === 0) break;
8730
+ if (stepCount >= maxSteps) break;
8731
+ toolCallCount += calls.length;
8732
+ for (const call of calls) {
8733
+ const tool = tools.find((t) => t.name === call.name);
8734
+ let output;
8735
+ let isError = false;
8736
+ try {
8737
+ if (!tool) {
8738
+ throw new Error(`pi-ai adapter: model called unknown tool '${call.name}'`);
8739
+ }
8740
+ output = await tool.execute(call.arguments);
8741
+ } catch (err) {
8742
+ output = err instanceof Error ? err.message : String(err);
8743
+ isError = true;
8744
+ }
8745
+ ctx.messages.push({
8746
+ role: "toolResult",
8747
+ toolCallId: call.id,
8748
+ toolName: call.name,
8749
+ content: [
8750
+ { type: "text", text: typeof output === "string" ? output : JSON.stringify(output) }
8751
+ ],
8752
+ isError,
8753
+ timestamp: Date.now()
8754
+ });
8755
+ }
8756
+ result = await withRetry(
8757
+ () => (0, import_pi_ai.complete)(model, ctx, callOptions),
8758
+ retryConfig,
8759
+ request.signal
8702
8760
  );
8761
+ ctx.messages.push(result);
8762
+ stepCount += 1;
8763
+ accumulateUsage(aggregateUsage, result.usage);
8703
8764
  }
8704
- return {
8705
- provider: model.slice(0, colonIndex),
8706
- modelName: model.slice(colonIndex + 1)
8707
- };
8765
+ const endTime = (/* @__PURE__ */ new Date()).toISOString();
8766
+ const durationMs = Date.now() - startMs;
8767
+ return mapPiResponse(result, {
8768
+ durationMs,
8769
+ startTime,
8770
+ endTime,
8771
+ aggregateUsage,
8772
+ steps: tools ? { count: stepCount, toolCallCount } : void 0
8773
+ });
8708
8774
  }
8709
- function createLanguageModel(modelString) {
8710
- const { provider, modelName } = parseModelString(modelString);
8711
- switch (provider) {
8712
- case "openai":
8713
- return (0, import_openai.createOpenAI)()(modelName);
8714
- case "anthropic":
8715
- return (0, import_anthropic.createAnthropic)()(modelName);
8716
- case "azure":
8717
- return (0, import_azure.createAzure)().chat(modelName);
8718
- case "google":
8719
- return (0, import_google.createGoogleGenerativeAI)()(modelName);
8720
- default:
8775
+ function accumulateUsage(agg, u) {
8776
+ agg.input += u.input;
8777
+ agg.output += u.output;
8778
+ agg.cacheRead += u.cacheRead;
8779
+ agg.cost += u.cost.total;
8780
+ }
8781
+ function resolvePiModel(args) {
8782
+ const { providerName, apiId, modelId, baseUrl } = args;
8783
+ let model;
8784
+ try {
8785
+ model = (0, import_pi_ai.getModel)(providerName, modelId);
8786
+ } catch {
8787
+ model = void 0;
8788
+ }
8789
+ if (!model) {
8790
+ const fallbackBaseUrl = baseUrl ?? defaultBaseUrlFor(providerName);
8791
+ if (!fallbackBaseUrl) {
8721
8792
  throw new Error(
8722
- `Unsupported AI SDK provider "${provider}" in model string "${modelString}". Supported providers: openai, anthropic, azure, google`
8793
+ `pi-ai adapter cannot resolve a baseUrl for provider '${providerName}' / model '${modelId}'. Either set the target's baseUrl/endpoint or use a model id pi-ai recognizes.`
8723
8794
  );
8724
- }
8725
- }
8726
- var import_anthropic, import_azure, import_google, import_openai, AgentvProvider;
8727
- var init_agentv_provider = __esm({
8728
- "src/evaluation/providers/agentv-provider.ts"() {
8729
- "use strict";
8730
- init_cjs_shims();
8731
- import_anthropic = require("@ai-sdk/anthropic");
8732
- import_azure = require("@ai-sdk/azure");
8733
- import_google = require("@ai-sdk/google");
8734
- import_openai = require("@ai-sdk/openai");
8735
- AgentvProvider = class {
8736
- id;
8737
- kind = "agentv";
8738
- targetName;
8739
- model;
8740
- constructor(targetName, config) {
8741
- this.id = `agentv:${targetName}`;
8742
- this.targetName = targetName;
8743
- this.model = createLanguageModel(config.model);
8744
- }
8745
- /**
8746
- * Direct invoke is not supported for the agentv provider.
8747
- * Use asLanguageModel() with generateText() instead.
8748
- */
8749
- async invoke(_request) {
8750
- throw new Error(
8751
- "AgentvProvider does not support direct invoke(). Use asLanguageModel() with generateText() instead."
8752
- );
8753
- }
8754
- /**
8755
- * Returns the resolved AI SDK LanguageModel for use with generateText/generateObject.
8756
- */
8757
- asLanguageModel() {
8758
- return this.model;
8759
- }
8795
+ }
8796
+ model = {
8797
+ id: modelId,
8798
+ name: modelId,
8799
+ api: apiId,
8800
+ provider: providerName,
8801
+ baseUrl: fallbackBaseUrl,
8802
+ reasoning: false,
8803
+ input: ["text"],
8804
+ cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
8805
+ contextWindow: 128e3,
8806
+ maxTokens: 16384
8760
8807
  };
8761
8808
  }
8762
- });
8763
-
8764
- // src/evaluation/providers/ai-sdk.ts
8765
- function buildAzureOptions(config) {
8766
- const options = {
8767
- apiKey: config.apiKey,
8768
- apiVersion: config.version,
8769
- // Chat completions still use deployment-scoped Azure URLs for compatibility
8770
- // with existing deployments. Responses API should use the SDK's v1 path.
8771
- useDeploymentBasedUrls: config.apiFormat !== "responses"
8772
- };
8773
- const baseURL = normalizeAzureBaseUrl(config.resourceName);
8774
- if (baseURL) {
8775
- options.baseURL = baseURL;
8776
- } else {
8777
- options.resourceName = config.resourceName;
8809
+ if (model.api !== apiId) {
8810
+ model = { ...model, api: apiId };
8811
+ }
8812
+ if (baseUrl) {
8813
+ model = { ...model, baseUrl };
8778
8814
  }
8779
- return options;
8815
+ return model;
8780
8816
  }
8781
- function normalizeAzureBaseUrl(resourceName) {
8782
- const trimmed = resourceName.trim();
8783
- if (!/^https?:\/\//i.test(trimmed)) {
8784
- return void 0;
8817
+ function defaultBaseUrlFor(providerName) {
8818
+ if (providerName === "openai") return "https://api.openai.com/v1";
8819
+ if (providerName === "openrouter") return "https://openrouter.ai/api/v1";
8820
+ return void 0;
8821
+ }
8822
+ function chatPromptToPiContext(chatPrompt) {
8823
+ const systemSegments = [];
8824
+ const messages = [];
8825
+ const now = Date.now();
8826
+ for (const message of chatPrompt) {
8827
+ if (message.role === "system") {
8828
+ systemSegments.push(message.content);
8829
+ continue;
8830
+ }
8831
+ if (message.role === "user") {
8832
+ messages.push({ role: "user", content: message.content, timestamp: now });
8833
+ continue;
8834
+ }
8835
+ if (message.role === "assistant") {
8836
+ messages.push({
8837
+ role: "assistant",
8838
+ content: [{ type: "text", text: message.content }],
8839
+ api: "",
8840
+ provider: "",
8841
+ model: "",
8842
+ usage: {
8843
+ input: 0,
8844
+ output: 0,
8845
+ cacheRead: 0,
8846
+ cacheWrite: 0,
8847
+ totalTokens: 0,
8848
+ cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }
8849
+ },
8850
+ stopReason: "stop",
8851
+ timestamp: now
8852
+ });
8853
+ continue;
8854
+ }
8855
+ if (message.role === "tool" || message.role === "function") {
8856
+ const prefix = message.name ? `@[${message.name}]: ` : "@[Tool]: ";
8857
+ messages.push({
8858
+ role: "assistant",
8859
+ content: [{ type: "text", text: `${prefix}${message.content}` }],
8860
+ api: "",
8861
+ provider: "",
8862
+ model: "",
8863
+ usage: {
8864
+ input: 0,
8865
+ output: 0,
8866
+ cacheRead: 0,
8867
+ cacheWrite: 0,
8868
+ totalTokens: 0,
8869
+ cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }
8870
+ },
8871
+ stopReason: "stop",
8872
+ timestamp: now
8873
+ });
8874
+ continue;
8875
+ }
8876
+ throw new Error(`pi-ai adapter received unsupported message role '${message.role}'.`);
8785
8877
  }
8786
- const withoutSlash = trimmed.replace(/\/+$/, "");
8787
- const normalized = withoutSlash.endsWith("/openai") ? withoutSlash : `${withoutSlash}/openai`;
8788
- return normalized;
8878
+ return {
8879
+ systemPrompt: systemSegments.length > 0 ? systemSegments.join("\n\n") : void 0,
8880
+ messages
8881
+ };
8789
8882
  }
8790
- function buildAnthropicProviderOptions(defaults) {
8791
- if (defaults.thinkingBudget === void 0) {
8792
- return void 0;
8883
+ function attachImagesToLastUserMessage(messages, images) {
8884
+ if (!images || images.length === 0) return;
8885
+ for (let i = messages.length - 1; i >= 0; i--) {
8886
+ const m = messages[i];
8887
+ if (m.role !== "user") continue;
8888
+ const text = typeof m.content === "string" ? m.content : "";
8889
+ messages[i] = {
8890
+ ...m,
8891
+ content: [
8892
+ ...text ? [{ type: "text", text }] : [],
8893
+ ...images.map((img) => ({
8894
+ type: "image",
8895
+ data: img.source,
8896
+ mimeType: img.media_type
8897
+ }))
8898
+ ]
8899
+ };
8900
+ return;
8793
8901
  }
8902
+ messages.push({
8903
+ role: "user",
8904
+ content: images.map((img) => ({
8905
+ type: "image",
8906
+ data: img.source,
8907
+ mimeType: img.media_type
8908
+ })),
8909
+ timestamp: Date.now()
8910
+ });
8911
+ }
8912
+ function mapPiResponse(result, timing) {
8913
+ const text = result.content.filter((b) => b.type === "text").map((b) => b.text).join("");
8914
+ const cached = timing.aggregateUsage.cacheRead > 0 ? timing.aggregateUsage.cacheRead : void 0;
8915
+ const tokenUsage = {
8916
+ input: timing.aggregateUsage.input,
8917
+ output: timing.aggregateUsage.output,
8918
+ ...cached !== void 0 ? { cached } : {}
8919
+ };
8920
+ const costUsd = timing.aggregateUsage.cost > 0 ? timing.aggregateUsage.cost : void 0;
8794
8921
  return {
8795
- anthropic: {
8796
- thinking: {
8797
- type: "enabled",
8798
- budgetTokens: defaults.thinkingBudget
8799
- }
8800
- }
8922
+ raw: result,
8923
+ usage: toJsonObject(result.usage),
8924
+ output: [{ role: "assistant", content: text }],
8925
+ tokenUsage,
8926
+ ...costUsd !== void 0 ? { costUsd } : {},
8927
+ durationMs: timing.durationMs,
8928
+ startTime: timing.startTime,
8929
+ endTime: timing.endTime,
8930
+ ...timing.steps ? { steps: timing.steps } : {}
8801
8931
  };
8802
8932
  }
8803
8933
  function buildChatPrompt(request) {
@@ -8812,92 +8942,21 @@ function buildChatPrompt(request) {
8812
8942
  }
8813
8943
  const systemContent = resolveSystemContent(request);
8814
8944
  const userContent = request.question.trim();
8815
- const prompt = [
8945
+ return [
8816
8946
  { role: "system", content: systemContent },
8817
8947
  { role: "user", content: userContent }
8818
8948
  ];
8819
- return prompt;
8820
8949
  }
8821
8950
  function resolveSystemContent(request) {
8822
- const systemSegments = [];
8823
8951
  if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
8824
- systemSegments.push(request.systemPrompt.trim());
8825
- } else {
8826
- systemSegments.push(DEFAULT_SYSTEM_PROMPT);
8952
+ return request.systemPrompt.trim();
8827
8953
  }
8828
- return systemSegments.join("\n\n");
8829
- }
8830
- function toModelMessages(chatPrompt) {
8831
- return chatPrompt.map((message) => {
8832
- if (message.role === "tool" || message.role === "function") {
8833
- const prefix = message.name ? `@[${message.name}]: ` : "@[Tool]: ";
8834
- return {
8835
- role: "assistant",
8836
- content: `${prefix}${message.content}`
8837
- };
8838
- }
8839
- if (message.role === "assistant" || message.role === "system" || message.role === "user") {
8840
- return {
8841
- role: message.role,
8842
- content: message.content
8843
- };
8844
- }
8845
- return {
8846
- role: "user",
8847
- content: message.content
8848
- };
8849
- });
8954
+ return DEFAULT_SYSTEM_PROMPT;
8850
8955
  }
8851
8956
  function resolveModelSettings(request, defaults) {
8852
- const temperature = request.temperature ?? defaults.temperature;
8853
- const maxOutputTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
8854
8957
  return {
8855
- temperature,
8856
- maxOutputTokens
8857
- };
8858
- }
8859
- async function invokeModel(options) {
8860
- const { model, request, defaults, retryConfig, providerOptions } = options;
8861
- const chatPrompt = buildChatPrompt(request);
8862
- const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
8863
- const startTime = (/* @__PURE__ */ new Date()).toISOString();
8864
- const startMs = Date.now();
8865
- const result = await withRetry(
8866
- () => (0, import_ai3.generateText)({
8867
- model,
8868
- messages: toModelMessages(chatPrompt),
8869
- temperature,
8870
- maxOutputTokens,
8871
- maxRetries: 0,
8872
- abortSignal: request.signal,
8873
- ...providerOptions ? { providerOptions } : {}
8874
- }),
8875
- retryConfig,
8876
- request.signal
8877
- );
8878
- const endTime = (/* @__PURE__ */ new Date()).toISOString();
8879
- const durationMs = Date.now() - startMs;
8880
- return mapResponse(result, { durationMs, startTime, endTime });
8881
- }
8882
- function mapResponse(result, timing) {
8883
- const content = result.text ?? "";
8884
- const rawUsage = result.totalUsage ?? result.usage;
8885
- const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
8886
- const cached = rawUsage?.inputTokenDetails?.cacheReadTokens ?? void 0;
8887
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? {
8888
- input: rawUsage.inputTokens,
8889
- output: rawUsage.outputTokens,
8890
- ...reasoning != null ? { reasoning } : {},
8891
- ...cached != null ? { cached } : {}
8892
- } : void 0;
8893
- return {
8894
- raw: result,
8895
- usage: toJsonObject(rawUsage),
8896
- output: [{ role: "assistant", content }],
8897
- tokenUsage,
8898
- durationMs: timing?.durationMs,
8899
- startTime: timing?.startTime,
8900
- endTime: timing?.endTime
8958
+ temperature: request.temperature ?? defaults.temperature,
8959
+ maxOutputTokens: request.maxOutputTokens ?? defaults.maxOutputTokens
8901
8960
  };
8902
8961
  }
8903
8962
  function toJsonObject(value) {
@@ -8911,9 +8970,7 @@ function toJsonObject(value) {
8911
8970
  }
8912
8971
  }
8913
8972
  function extractStatus(error) {
8914
- if (!error || typeof error !== "object") {
8915
- return void 0;
8916
- }
8973
+ if (!error || typeof error !== "object") return void 0;
8917
8974
  const candidate = error;
8918
8975
  const directStatus = candidate.status ?? candidate.statusCode;
8919
8976
  if (typeof directStatus === "number" && Number.isFinite(directStatus)) {
@@ -8928,21 +8985,15 @@ function extractStatus(error) {
8928
8985
  const match = message.match(/HTTP\s+(\d{3})/i);
8929
8986
  if (match) {
8930
8987
  const parsed = Number.parseInt(match[1], 10);
8931
- if (Number.isFinite(parsed)) {
8932
- return parsed;
8933
- }
8988
+ if (Number.isFinite(parsed)) return parsed;
8934
8989
  }
8935
8990
  }
8936
8991
  return void 0;
8937
8992
  }
8938
8993
  function isNetworkError(error) {
8939
- if (!error || typeof error !== "object") {
8940
- return false;
8941
- }
8994
+ if (!error || typeof error !== "object") return false;
8942
8995
  const candidate = error;
8943
- if (candidate.name === "AbortError") {
8944
- return false;
8945
- }
8996
+ if (candidate.name === "AbortError") return false;
8946
8997
  const code = candidate.code;
8947
8998
  if (typeof code === "string" && /^E(AI|CONN|HOST|NET|PIPE|TIME|REFUSED|RESET)/i.test(code)) {
8948
8999
  return true;
@@ -8955,12 +9006,8 @@ function isNetworkError(error) {
8955
9006
  }
8956
9007
  function isRetryableError(error, retryableStatusCodes) {
8957
9008
  const status = extractStatus(error);
8958
- if (status === 401 || status === 403) {
8959
- return false;
8960
- }
8961
- if (typeof status === "number") {
8962
- return retryableStatusCodes.includes(status);
8963
- }
9009
+ if (status === 401 || status === 403) return false;
9010
+ if (typeof status === "number") return retryableStatusCodes.includes(status);
8964
9011
  return isNetworkError(error);
8965
9012
  }
8966
9013
  function calculateRetryDelay(attempt, config) {
@@ -8990,195 +9037,266 @@ async function withRetry(fn, retryConfig, signal) {
8990
9037
  return await fn();
8991
9038
  } catch (error) {
8992
9039
  lastError = error;
8993
- if (attempt >= config.maxRetries) {
8994
- break;
8995
- }
8996
- if (!isRetryableError(error, config.retryableStatusCodes)) {
8997
- throw error;
8998
- }
9040
+ if (attempt >= config.maxRetries) break;
9041
+ if (!isRetryableError(error, config.retryableStatusCodes)) throw error;
8999
9042
  const delay = calculateRetryDelay(attempt, config);
9000
9043
  await sleep(delay);
9001
9044
  }
9002
9045
  }
9003
9046
  throw lastError;
9004
9047
  }
9005
- var import_anthropic2, import_azure2, import_google2, import_openai2, import_ai_sdk_provider, import_ai3, DEFAULT_SYSTEM_PROMPT, OpenAIProvider, AzureProvider, OpenRouterProvider, AnthropicProvider, GeminiProvider;
9006
- var init_ai_sdk = __esm({
9007
- "src/evaluation/providers/ai-sdk.ts"() {
9048
+ var import_pi_ai, DEFAULT_SYSTEM_PROMPT, OpenAIProvider, OpenRouterProvider, AnthropicProvider, GeminiProvider, AzureProvider;
9049
+ var init_llm_providers = __esm({
9050
+ "src/evaluation/providers/llm-providers.ts"() {
9008
9051
  "use strict";
9009
9052
  init_cjs_shims();
9010
- import_anthropic2 = require("@ai-sdk/anthropic");
9011
- import_azure2 = require("@ai-sdk/azure");
9012
- import_google2 = require("@ai-sdk/google");
9013
- import_openai2 = require("@ai-sdk/openai");
9014
- import_ai_sdk_provider = require("@openrouter/ai-sdk-provider");
9015
- import_ai3 = require("ai");
9053
+ import_pi_ai = require("@mariozechner/pi-ai");
9054
+ (0, import_pi_ai.registerBuiltInApiProviders)();
9016
9055
  DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
9017
9056
  OpenAIProvider = class {
9018
- constructor(targetName, config) {
9019
- this.config = config;
9020
- this.id = `openai:${targetName}`;
9021
- this.targetName = targetName;
9022
- this.defaults = {
9023
- temperature: config.temperature,
9024
- maxOutputTokens: config.maxOutputTokens
9025
- };
9026
- this.retryConfig = config.retry;
9027
- const openai = (0, import_openai2.createOpenAI)({
9028
- apiKey: config.apiKey,
9029
- baseURL: config.baseURL
9030
- });
9031
- this.model = config.apiFormat === "responses" ? openai(config.model) : openai.chat(config.model);
9032
- }
9033
9057
  id;
9034
9058
  kind = "openai";
9035
9059
  targetName;
9036
- model;
9060
+ piModel;
9037
9061
  defaults;
9038
9062
  retryConfig;
9039
- async invoke(request) {
9040
- return invokeModel({
9041
- model: this.model,
9042
- request,
9043
- defaults: this.defaults,
9044
- retryConfig: this.retryConfig
9045
- });
9046
- }
9047
- asLanguageModel() {
9048
- return this.model;
9049
- }
9050
- };
9051
- AzureProvider = class {
9063
+ apiKey;
9052
9064
  constructor(targetName, config) {
9053
- this.config = config;
9054
- this.id = `azure:${targetName}`;
9065
+ this.id = `openai:${targetName}`;
9055
9066
  this.targetName = targetName;
9067
+ this.apiKey = config.apiKey;
9056
9068
  this.defaults = {
9057
9069
  temperature: config.temperature,
9058
9070
  maxOutputTokens: config.maxOutputTokens
9059
9071
  };
9060
9072
  this.retryConfig = config.retry;
9061
- const azure = (0, import_azure2.createAzure)(buildAzureOptions(config));
9062
- this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
9073
+ this.piModel = resolvePiModel({
9074
+ providerName: "openai",
9075
+ apiId: config.apiFormat === "responses" ? "openai-responses" : "openai-completions",
9076
+ modelId: config.model,
9077
+ baseUrl: config.baseURL
9078
+ });
9063
9079
  }
9064
- id;
9065
- kind = "azure";
9066
- targetName;
9067
- model;
9068
- defaults;
9069
- retryConfig;
9070
9080
  async invoke(request) {
9071
- return invokeModel({
9072
- model: this.model,
9081
+ return invokePiAi({
9082
+ model: this.piModel,
9083
+ apiKey: this.apiKey,
9073
9084
  request,
9074
9085
  defaults: this.defaults,
9075
9086
  retryConfig: this.retryConfig
9076
9087
  });
9077
9088
  }
9078
- asLanguageModel() {
9079
- return this.model;
9080
- }
9081
9089
  };
9082
9090
  OpenRouterProvider = class {
9091
+ id;
9092
+ kind = "openrouter";
9093
+ targetName;
9094
+ piModel;
9095
+ defaults;
9096
+ retryConfig;
9097
+ apiKey;
9083
9098
  constructor(targetName, config) {
9084
- this.config = config;
9085
9099
  this.id = `openrouter:${targetName}`;
9086
9100
  this.targetName = targetName;
9101
+ this.apiKey = config.apiKey;
9087
9102
  this.defaults = {
9088
9103
  temperature: config.temperature,
9089
9104
  maxOutputTokens: config.maxOutputTokens
9090
9105
  };
9091
9106
  this.retryConfig = config.retry;
9092
- const openrouter = (0, import_ai_sdk_provider.createOpenRouter)({
9093
- apiKey: config.apiKey
9107
+ this.piModel = resolvePiModel({
9108
+ providerName: "openrouter",
9109
+ apiId: "openai-completions",
9110
+ modelId: config.model,
9111
+ baseUrl: "https://openrouter.ai/api/v1"
9094
9112
  });
9095
- this.model = openrouter(config.model);
9096
9113
  }
9097
- id;
9098
- kind = "openrouter";
9099
- targetName;
9100
- model;
9101
- defaults;
9102
- retryConfig;
9103
9114
  async invoke(request) {
9104
- return invokeModel({
9105
- model: this.model,
9115
+ return invokePiAi({
9116
+ model: this.piModel,
9117
+ apiKey: this.apiKey,
9106
9118
  request,
9107
9119
  defaults: this.defaults,
9108
9120
  retryConfig: this.retryConfig
9109
9121
  });
9110
9122
  }
9111
- asLanguageModel() {
9112
- return this.model;
9113
- }
9114
9123
  };
9115
9124
  AnthropicProvider = class {
9125
+ id;
9126
+ kind = "anthropic";
9127
+ targetName;
9128
+ piModel;
9129
+ defaults;
9130
+ retryConfig;
9131
+ apiKey;
9132
+ thinkingBudget;
9116
9133
  constructor(targetName, config) {
9117
- this.config = config;
9118
9134
  this.id = `anthropic:${targetName}`;
9119
9135
  this.targetName = targetName;
9136
+ this.apiKey = config.apiKey;
9137
+ this.thinkingBudget = config.thinkingBudget;
9120
9138
  this.defaults = {
9121
9139
  temperature: config.temperature,
9122
9140
  maxOutputTokens: config.maxOutputTokens,
9123
9141
  thinkingBudget: config.thinkingBudget
9124
9142
  };
9125
9143
  this.retryConfig = config.retry;
9126
- const anthropic = (0, import_anthropic2.createAnthropic)({
9127
- apiKey: config.apiKey
9144
+ this.piModel = resolvePiModel({
9145
+ providerName: "anthropic",
9146
+ apiId: "anthropic-messages",
9147
+ modelId: config.model
9128
9148
  });
9129
- this.model = anthropic(config.model);
9130
9149
  }
9131
- id;
9132
- kind = "anthropic";
9133
- targetName;
9134
- model;
9135
- defaults;
9136
- retryConfig;
9137
9150
  async invoke(request) {
9138
- const providerOptions = buildAnthropicProviderOptions(this.defaults);
9139
- return invokeModel({
9140
- model: this.model,
9151
+ const providerOptions = this.thinkingBudget !== void 0 ? { thinkingEnabled: true, thinkingBudgetTokens: this.thinkingBudget } : void 0;
9152
+ return invokePiAi({
9153
+ model: this.piModel,
9154
+ apiKey: this.apiKey,
9141
9155
  request,
9142
9156
  defaults: this.defaults,
9143
9157
  retryConfig: this.retryConfig,
9144
- providerOptions
9158
+ ...providerOptions ? { providerOptions } : {}
9145
9159
  });
9146
9160
  }
9147
- asLanguageModel() {
9148
- return this.model;
9149
- }
9150
9161
  };
9151
9162
  GeminiProvider = class {
9163
+ id;
9164
+ kind = "gemini";
9165
+ targetName;
9166
+ piModel;
9167
+ defaults;
9168
+ retryConfig;
9169
+ apiKey;
9152
9170
  constructor(targetName, config) {
9153
- this.config = config;
9154
9171
  this.id = `gemini:${targetName}`;
9155
9172
  this.targetName = targetName;
9173
+ this.apiKey = config.apiKey;
9156
9174
  this.defaults = {
9157
9175
  temperature: config.temperature,
9158
9176
  maxOutputTokens: config.maxOutputTokens
9159
9177
  };
9160
9178
  this.retryConfig = config.retry;
9161
- const google = (0, import_google2.createGoogleGenerativeAI)({
9162
- apiKey: config.apiKey
9179
+ this.piModel = resolvePiModel({
9180
+ providerName: "google",
9181
+ apiId: "google-generative-ai",
9182
+ modelId: config.model
9183
+ });
9184
+ }
9185
+ async invoke(request) {
9186
+ return invokePiAi({
9187
+ model: this.piModel,
9188
+ apiKey: this.apiKey,
9189
+ request,
9190
+ defaults: this.defaults,
9191
+ retryConfig: this.retryConfig
9163
9192
  });
9164
- this.model = google(config.model);
9165
9193
  }
9194
+ };
9195
+ AzureProvider = class {
9166
9196
  id;
9167
- kind = "gemini";
9197
+ kind = "azure";
9168
9198
  targetName;
9169
- model;
9199
+ piModel;
9170
9200
  defaults;
9171
9201
  retryConfig;
9202
+ apiKey;
9203
+ providerOptions;
9204
+ constructor(targetName, config) {
9205
+ this.id = `azure:${targetName}`;
9206
+ this.targetName = targetName;
9207
+ this.apiKey = config.apiKey;
9208
+ this.defaults = {
9209
+ temperature: config.temperature,
9210
+ maxOutputTokens: config.maxOutputTokens
9211
+ };
9212
+ this.retryConfig = config.retry;
9213
+ const trimmed = config.resourceName.trim();
9214
+ const isFullUrl = /^https?:\/\//i.test(trimmed);
9215
+ const baseUrl = isFullUrl ? buildAzureBaseUrl(trimmed) : void 0;
9216
+ this.providerOptions = {
9217
+ ...baseUrl ? { azureBaseUrl: baseUrl } : { azureResourceName: trimmed },
9218
+ ...config.version ? { azureApiVersion: config.version } : {}
9219
+ };
9220
+ this.piModel = resolvePiModel({
9221
+ providerName: "azure-openai-responses",
9222
+ apiId: "azure-openai-responses",
9223
+ // The "model id" for Azure is the deployment name.
9224
+ modelId: config.deploymentName,
9225
+ ...baseUrl ? { baseUrl } : {}
9226
+ });
9227
+ }
9172
9228
  async invoke(request) {
9173
- return invokeModel({
9174
- model: this.model,
9229
+ return invokePiAi({
9230
+ model: this.piModel,
9231
+ apiKey: this.apiKey,
9175
9232
  request,
9176
9233
  defaults: this.defaults,
9177
- retryConfig: this.retryConfig
9234
+ retryConfig: this.retryConfig,
9235
+ providerOptions: this.providerOptions
9178
9236
  });
9179
9237
  }
9180
- asLanguageModel() {
9181
- return this.model;
9238
+ };
9239
+ }
9240
+ });
9241
+
9242
+ // src/evaluation/providers/agentv-provider.ts
9243
+ var agentv_provider_exports = {};
9244
+ __export(agentv_provider_exports, {
9245
+ AgentvProvider: () => AgentvProvider
9246
+ });
9247
+ function parseAgentvModel(model) {
9248
+ const colonIndex = model.indexOf(":");
9249
+ if (colonIndex === -1) {
9250
+ throw new Error(
9251
+ `Invalid agentv model "${model}". Expected "provider:model" (e.g., "openai:gpt-5-mini").`
9252
+ );
9253
+ }
9254
+ const provider = model.slice(0, colonIndex);
9255
+ const modelId = model.slice(colonIndex + 1);
9256
+ switch (provider) {
9257
+ case "openai":
9258
+ return { providerName: "openai", apiId: "openai-completions", modelId };
9259
+ case "anthropic":
9260
+ return { providerName: "anthropic", apiId: "anthropic-messages", modelId };
9261
+ case "azure":
9262
+ return {
9263
+ providerName: "azure-openai-responses",
9264
+ apiId: "azure-openai-responses",
9265
+ modelId
9266
+ };
9267
+ case "google":
9268
+ return { providerName: "google", apiId: "google-generative-ai", modelId };
9269
+ default:
9270
+ throw new Error(
9271
+ `Unsupported agentv provider "${provider}" in "${model}". Supported: openai, anthropic, azure, google.`
9272
+ );
9273
+ }
9274
+ }
9275
+ var AgentvProvider;
9276
+ var init_agentv_provider = __esm({
9277
+ "src/evaluation/providers/agentv-provider.ts"() {
9278
+ "use strict";
9279
+ init_cjs_shims();
9280
+ init_llm_providers();
9281
+ AgentvProvider = class {
9282
+ id;
9283
+ kind = "agentv";
9284
+ targetName;
9285
+ piModel;
9286
+ defaults;
9287
+ constructor(targetName, config) {
9288
+ this.id = `agentv:${targetName}`;
9289
+ this.targetName = targetName;
9290
+ const { providerName, apiId, modelId } = parseAgentvModel(config.model);
9291
+ this.piModel = resolvePiModel({ providerName, apiId, modelId });
9292
+ this.defaults = { temperature: config.temperature };
9293
+ }
9294
+ async invoke(request) {
9295
+ return invokePiAi({
9296
+ model: this.piModel,
9297
+ request,
9298
+ defaults: this.defaults
9299
+ });
9182
9300
  }
9183
9301
  };
9184
9302
  }
@@ -13381,10 +13499,10 @@ function extractToolCallsFromEvents(events) {
13381
13499
  }
13382
13500
  }
13383
13501
  const toolCalls = [];
13384
- for (const [id, { tool: tool2, input }] of starts) {
13502
+ for (const [id, { tool, input }] of starts) {
13385
13503
  toolCalls.push(
13386
13504
  normalizeToolCall("pi-cli", {
13387
- tool: tool2,
13505
+ tool,
13388
13506
  input,
13389
13507
  id: id.startsWith("anon-") ? void 0 : id,
13390
13508
  output: results.get(id)
@@ -17765,7 +17883,6 @@ var init_providers = __esm({
17765
17883
  "use strict";
17766
17884
  init_cjs_shims();
17767
17885
  init_agentv_provider();
17768
- init_ai_sdk();
17769
17886
  init_claude_cli();
17770
17887
  init_claude_sdk();
17771
17888
  init_cli();
@@ -17773,6 +17890,7 @@ var init_providers = __esm({
17773
17890
  init_copilot_cli();
17774
17891
  init_copilot_log();
17775
17892
  init_copilot_sdk();
17893
+ init_llm_providers();
17776
17894
  init_mock();
17777
17895
  init_pi_cli();
17778
17896
  init_pi_coding_agent();
@@ -19799,6 +19917,19 @@ async function runEvaluation(options) {
19799
19917
  await dockerSetup.pullImage();
19800
19918
  setupLog("Docker image pull complete");
19801
19919
  }
19920
+ if (suiteWorkspace?.env) {
19921
+ try {
19922
+ await runPreflightChecks(suiteWorkspace.env, sharedWorkspacePath ?? void 0, setupLog);
19923
+ setupLog("preflight checks passed");
19924
+ } catch (error) {
19925
+ const message = error instanceof Error ? error.message : String(error);
19926
+ if (sharedWorkspacePath && !useStaticWorkspace) {
19927
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
19928
+ });
19929
+ }
19930
+ throw new Error(message);
19931
+ }
19932
+ }
19802
19933
  const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
19803
19934
  const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
19804
19935
  if (sharedWorkspacePath && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) {
@@ -22069,6 +22200,38 @@ function computeWeightedMean(entries) {
22069
22200
  }
22070
22201
  return totalWeight > 0 ? weightedSum / totalWeight : 0;
22071
22202
  }
22203
+ async function runPreflightChecks(env, cwd, log) {
22204
+ const execFileAsync5 = (0, import_node_util7.promisify)(import_node_child_process11.execFile);
22205
+ const missing = [];
22206
+ for (const cmd of env.required_commands ?? []) {
22207
+ log(`preflight: checking command "${cmd}"`);
22208
+ try {
22209
+ if (process.platform === "win32") {
22210
+ await execFileAsync5("where", [cmd], { cwd });
22211
+ } else {
22212
+ await execFileAsync5("sh", ["-c", `command -v ${cmd}`], { cwd });
22213
+ }
22214
+ } catch {
22215
+ missing.push(`command: ${cmd}`);
22216
+ }
22217
+ }
22218
+ for (const mod of env.required_python_modules ?? []) {
22219
+ log(`preflight: checking Python module "${mod}"`);
22220
+ try {
22221
+ await execFileAsync5("python3", ["-c", `import ${mod}`], { cwd });
22222
+ } catch {
22223
+ missing.push(`python module: ${mod}`);
22224
+ }
22225
+ }
22226
+ if (missing.length > 0) {
22227
+ throw new Error(
22228
+ `Preflight checks failed \u2014 missing dependencies:
22229
+ ${missing.map((m) => ` \u2022 ${m}`).join("\n")}
22230
+
22231
+ Install the missing dependencies before running this eval.`
22232
+ );
22233
+ }
22234
+ }
22072
22235
  var import_node_child_process11, import_node_crypto11, import_node_fs16, import_promises35, import_node_path47, import_node_util7, import_micromatch2, execFileAsync3, WORKSPACE_GIT_TIMEOUT_MS;
22073
22236
  var init_orchestrator = __esm({
22074
22237
  "src/evaluation/orchestrator.ts"() {
@@ -22931,7 +23094,8 @@ function parseWorkspaceConfig(raw, evalFileDir) {
22931
23094
  const workspacePath = typeof obj.path === "string" ? obj.path : void 0;
22932
23095
  const mode = explicitMode ?? (workspacePath ? "static" : void 0);
22933
23096
  const docker = parseDockerWorkspaceConfig(obj.docker);
22934
- if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker)
23097
+ const env = parseWorkspaceEnvConfig(obj.env);
23098
+ if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker && !env)
22935
23099
  return void 0;
22936
23100
  return {
22937
23101
  ...template !== void 0 && { template },
@@ -22940,7 +23104,19 @@ function parseWorkspaceConfig(raw, evalFileDir) {
22940
23104
  ...hooks !== void 0 && { hooks },
22941
23105
  ...mode !== void 0 && { mode },
22942
23106
  ...workspacePath !== void 0 && { path: workspacePath },
22943
- ...docker !== void 0 && { docker }
23107
+ ...docker !== void 0 && { docker },
23108
+ ...env !== void 0 && { env }
23109
+ };
23110
+ }
23111
+ function parseWorkspaceEnvConfig(raw) {
23112
+ if (!isJsonObject(raw)) return void 0;
23113
+ const obj = raw;
23114
+ const required_commands = Array.isArray(obj.required_commands) ? obj.required_commands.filter((c) => typeof c === "string") : void 0;
23115
+ const required_python_modules = Array.isArray(obj.required_python_modules) ? obj.required_python_modules.filter((m) => typeof m === "string") : void 0;
23116
+ if (!required_commands?.length && !required_python_modules?.length) return void 0;
23117
+ return {
23118
+ ...required_commands?.length && { required_commands },
23119
+ ...required_python_modules?.length && { required_python_modules }
22944
23120
  };
22945
23121
  }
22946
23122
  function parseDockerWorkspaceConfig(raw) {
@@ -24966,8 +25142,8 @@ init_cjs_shims();
24966
25142
 
24967
25143
  // src/evaluation/generators/rubric-generator.ts
24968
25144
  init_cjs_shims();
24969
- var import_ai4 = require("ai");
24970
25145
  var import_zod6 = require("zod");
25146
+ init_types2();
24971
25147
  var rubricItemSchema = import_zod6.z.object({
24972
25148
  id: import_zod6.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
24973
25149
  outcome: import_zod6.z.string().describe("Concrete expected outcome for this rubric item"),
@@ -24980,10 +25156,6 @@ var rubricGenerationSchema = import_zod6.z.object({
24980
25156
  async function generateRubrics(options) {
24981
25157
  const { criteria, question, referenceAnswer, provider } = options;
24982
25158
  const prompt = buildPrompt(criteria, question, referenceAnswer);
24983
- const model = provider.asLanguageModel?.();
24984
- if (!model) {
24985
- throw new Error("Provider does not support language model interface");
24986
- }
24987
25159
  const system = `You are an expert at creating evaluation rubrics.
24988
25160
  You must return a valid JSON object matching this schema:
24989
25161
  {
@@ -25000,11 +25172,11 @@ You must return a valid JSON object matching this schema:
25000
25172
  let lastError;
25001
25173
  for (let attempt = 1; attempt <= 3; attempt++) {
25002
25174
  try {
25003
- const { text } = await (0, import_ai4.generateText)({
25004
- model,
25005
- system,
25006
- prompt
25175
+ const response = await provider.invoke({
25176
+ question: prompt,
25177
+ systemPrompt: system
25007
25178
  });
25179
+ const text = extractLastAssistantContent2(response.output);
25008
25180
  const cleaned = text.replace(/```json\n?|```/g, "").trim();
25009
25181
  result = rubricGenerationSchema.parse(JSON.parse(cleaned));
25010
25182
  break;