@agentv/core 4.25.1 → 4.25.2-next.1
- package/dist/agentv-provider-MUIGGIP3.js +7 -0
- package/dist/chunk-5XV3FAAD.js +616 -0
- package/dist/chunk-5XV3FAAD.js.map +1 -0
- package/dist/{chunk-6HLBKYE2.js → chunk-CALQDF2Y.js} +1 -1
- package/dist/chunk-CALQDF2Y.js.map +1 -0
- package/dist/{chunk-IXTJEXWN.js → chunk-F234XBWV.js} +185 -551
- package/dist/chunk-F234XBWV.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +589 -419
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +78 -8
- package/dist/index.d.ts +78 -8
- package/dist/index.js +7 -12
- package/dist/index.js.map +1 -1
- package/dist/ts-eval-loader-5JMF2N65.js +12 -0
- package/package.json +2 -7
- package/dist/agentv-provider-TXM4UEUT.js +0 -7
- package/dist/chunk-6HLBKYE2.js.map +0 -1
- package/dist/chunk-IXTJEXWN.js.map +0 -1
- package/dist/chunk-PRNXHNLF.js +0 -65
- package/dist/chunk-PRNXHNLF.js.map +0 -1
- package/dist/ts-eval-loader-4CFPGHGT.js +0 -12
- /package/dist/{agentv-provider-TXM4UEUT.js.map → agentv-provider-MUIGGIP3.js.map} +0 -0
- /package/dist/{ts-eval-loader-4CFPGHGT.js.map → ts-eval-loader-5JMF2N65.js.map} +0 -0
package/dist/index.cjs
CHANGED
@@ -223,7 +223,7 @@ function computeTraceSummary(messages) {
 function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
   if (summary.eventCount === 0) return void 0;
   const explorationCalls = explorationTools.reduce(
-    (sum,
+    (sum, tool) => sum + (summary.toolCalls[tool] ?? 0),
     0
   );
   return explorationCalls / summary.eventCount;
@@ -5187,8 +5187,17 @@ async function materializeContentForGrader(messages, getWorkDir) {
   }
   return result;
 }
+async function runScriptRaw(scriptPath, input, agentTimeoutMs, cwd, env) {
+  return typeof scriptPath === "string" ? execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
+}
 async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
-  const { stdout, stderr, exitCode } =
+  const { stdout, stderr, exitCode } = await runScriptRaw(
+    scriptPath,
+    input,
+    agentTimeoutMs,
+    cwd,
+    env
+  );
   if (exitCode !== 0) {
     const trimmedErr = formatStderr(stderr);
     throw new Error(
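Annotation: the new `runScriptRaw` helper is the seam that `executeScript` and the code grader now share — a string command goes to `execShellWithStdin`, anything else to `execFileWithStdin`, and the result resolves without throwing on a non-zero exit. A minimal usage sketch; the call sites, payloads, and the array form of `scriptPath` are assumptions inferred from the string/non-string dispatch, not documented API:

```js
// Hypothetical call sites; { stdout, stderr, exitCode } is the shape
// destructured by executeScript above.
const viaShell = await runScriptRaw("python3 grade.py --strict", payload, 60000, "/work", env);
const viaFile = await runScriptRaw(["python3", "grade.py"], payload, 60000, "/work", env);
// Neither call throws on exitCode !== 0; the caller decides what is fatal.
```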
@@ -5306,6 +5315,8 @@ var init_code_grader = __esm({
       const env = proxyEnv || workspaceEnv ? { ...proxyEnv, ...workspaceEnv } : void 0;
       try {
         let stdout;
+        let exitCode = 0;
+        let execStderr = "";
         if (context2.dockerConfig) {
           const { DockerWorkspaceProvider: DockerWorkspaceProvider2 } = await Promise.resolve().then(() => (init_docker_workspace(), docker_workspace_exports));
           const dockerProvider = new DockerWorkspaceProvider2(context2.dockerConfig);
@@ -5314,31 +5325,40 @@ var init_code_grader = __esm({
             stdin: inputPayload,
             repoCheckouts: getRepoCheckoutTargets(context2.evalCase.workspace?.repos)
           });
-
-            const trimmedErr = result.stderr.trim();
-            throw new Error(
-              trimmedErr.length > 0 ? `Code evaluator exited with code ${result.exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${result.exitCode}`
-            );
-          }
+          exitCode = result.exitCode;
           stdout = result.stdout.trim();
+          execStderr = result.stderr;
         } else {
-
+          const result = await runScriptRaw(
             this.command,
             inputPayload,
             this.agentTimeoutMs,
             this.cwd,
             env
           );
+          exitCode = result.exitCode;
+          stdout = result.stdout.trim();
+          execStderr = result.stderr;
         }
-        const
-        const
-
+        const looksLikeJson = stdout.startsWith("{") || stdout.startsWith("[");
+        const hasStderr = execStderr.trim().length > 0;
+        if (exitCode !== 0 && (looksLikeJson || hasStderr)) {
+          const trimmedErr = formatStderr(execStderr);
+          throw new Error(
+            trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
+          );
+        }
+        const rawParsed = parseJsonSafe(stdout);
+        const parsed = rawParsed != null && typeof rawParsed === "object" && !Array.isArray(rawParsed) ? rawParsed : void 0;
+        const passed = exitCode === 0;
+        const score = parsed != null ? clampScore(typeof parsed.score === "number" ? parsed.score : 0) : passed ? 1 : 0;
+        const assertions = parsed != null && Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
           (a) => typeof a === "object" && a !== null && typeof a.text === "string"
         ).map((a) => ({
           text: String(a.text),
           passed: Boolean(a.passed),
           ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
-        })) : [];
+        })) : parsed == null ? [{ text: stdout.trim() || (passed ? "exit 0" : `exit ${exitCode}`), passed }] : [];
         const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
         const proxyUsage = getProxyUsage?.();
         const graderRawRequest = {
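Annotation: the parsing introduced above implies a small stdout contract for code evaluators — a JSON object is preferred, and a non-JSON exit-0 run now degrades to a single synthetic assertion instead of an error. A hypothetical evaluator output consistent with the fields the parser reads (`score`, `assertions[].text/passed/evidence`, `details`); the values are illustrative:

```js
// What a code evaluator script might print to stdout:
console.log(JSON.stringify({
  score: 0.75,                                  // clamped via clampScore
  assertions: [
    { text: "build succeeds", passed: true, evidence: "npm run build exited 0" },
    { text: "tests pass", passed: false }
  ],
  details: { failing_suite: "snapshot tests" }  // optional free-form object
}));
// With no JSON at all, an exit code of 0 now yields score 1 and a single
// assertion like { text: "exit 0", passed: true }.
```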
@@ -5646,13 +5666,6 @@ function extractImageBlocks(messages) {
   }
   return images;
 }
-function toAiSdkImageParts(images) {
-  return images.map((img) => ({
-    type: "image",
-    image: img.source,
-    mediaType: img.media_type || void 0
-  }));
-}
 function resolveSandboxed(basePath, relativePath) {
   const resolved = import_node_path12.default.resolve(basePath, relativePath);
   if (!resolved.startsWith(basePath + import_node_path12.default.sep) && resolved !== basePath) {
@@ -5661,15 +5674,24 @@ function resolveSandboxed(basePath, relativePath) {
   return resolved;
 }
 function createFilesystemTools(workspacePath) {
-  return
-
+  return [
+    {
+      name: "list_files",
       description: "List files and directories at a relative path within the workspace. Returns names only (single level, no recursion).",
-
-
-
+      parameters: {
+        type: "object",
+        properties: {
+          path: {
+            type: "string",
+            description: 'Relative path within workspace (use "." for root)',
+            default: "."
+          }
+        }
+      },
       execute: async (input) => {
+        const args = input ?? {};
         try {
-          const resolved = resolveSandboxed(workspacePath,
+          const resolved = resolveSandboxed(workspacePath, args.path ?? ".");
           const entries = await import_promises12.default.readdir(resolved, { withFileTypes: true });
           return entries.map((e) => ({
             name: e.name,
@@ -5679,18 +5701,25 @@ function createFilesystemTools(workspacePath) {
           return { error: error instanceof Error ? error.message : String(error) };
         }
       }
-    }
-
+    },
+    {
+      name: "read_file",
       description: "Read the content of a file at a relative path within the workspace. Large files are truncated at 50KB.",
-
-
-
+      parameters: {
+        type: "object",
+        properties: {
+          path: { type: "string", description: "Relative path to file within workspace" }
+        },
+        required: ["path"]
+      },
       execute: async (input) => {
+        const args = input ?? {};
+        const relPath = args.path ?? "";
         try {
-          const resolved = resolveSandboxed(workspacePath,
+          const resolved = resolveSandboxed(workspacePath, relPath);
           const stat14 = await import_promises12.default.stat(resolved);
           if (stat14.isDirectory()) {
-            return { error: `'${
+            return { error: `'${relPath}' is a directory, not a file` };
           }
           const buffer = Buffer.alloc(Math.min(stat14.size, MAX_FILE_SIZE));
           const fd = await import_promises12.default.open(resolved, "r");
@@ -5706,19 +5735,29 @@ function createFilesystemTools(workspacePath) {
           return { error: error instanceof Error ? error.message : String(error) };
         }
       }
-    }
-
+    },
+    {
+      name: "search_files",
       description: "Search for a regex pattern across files in the workspace. Returns up to 20 matches. Skips binary files and node_modules/.git.",
-
-
-
-
+      parameters: {
+        type: "object",
+        properties: {
+          pattern: { type: "string", description: "Regex pattern to search for" },
+          path: {
+            type: "string",
+            description: 'Relative path to search within (use "." for root)',
+            default: "."
+          }
+        },
+        required: ["pattern"]
+      },
       execute: async (input) => {
+        const args = input ?? {};
         try {
-          const resolved = resolveSandboxed(workspacePath,
+          const resolved = resolveSandboxed(workspacePath, args.path ?? ".");
           let regex;
           try {
-            regex = new RegExp(
+            regex = new RegExp(args.pattern ?? "", "gi");
           } catch (regexErr) {
             return {
               error: `Invalid regex pattern: ${regexErr instanceof Error ? regexErr.message : String(regexErr)}`
@@ -5731,8 +5770,8 @@ function createFilesystemTools(workspacePath) {
           return { error: error instanceof Error ? error.message : String(error) };
         }
       }
-  }
-
+    }
+  ];
 }
 async function searchDirectory(dirPath, workspacePath, regex, matches) {
   if (matches.length >= MAX_SEARCH_MATCHES) return;
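Annotation: after these hunks `createFilesystemTools` returns a plain array of provider-neutral tool definitions (JSON-schema `parameters` plus an async `execute`) rather than AI SDK tool objects. The resulting shape, with the handler bodies elided (the workspace path is illustrative):

```js
const fsTools = createFilesystemTools("/tmp/eval-workspace");
// [
//   { name: "list_files",   description: "...", parameters: { type: "object", ... },        execute: async (input) => ... },
//   { name: "read_file",    description: "...", parameters: { ..., required: ["path"] },    execute: async (input) => ... },
//   { name: "search_files", description: "...", parameters: { ..., required: ["pattern"] }, execute: async (input) => ... }
// ]
// Each execute() resolves with data or { error: string } instead of throwing,
// so failures are surfaced back to the model as tool results.
```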
@@ -5772,14 +5811,13 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
     }
   }
 }
-var import_promises12, import_node_path12,
+var import_promises12, import_node_path12, import_zod2, DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT, MAX_FILE_SIZE, MAX_SEARCH_MATCHES, SEARCH_SKIP_DIRS, BINARY_EXTENSIONS, DEFAULT_GRADER_TEMPLATE, freeformEvaluationSchema, rubricCheckResultSchema, rubricEvaluationSchema, scoreRangeCheckResultSchema, scoreRangeEvaluationSchema, LlmGrader, ANSI_YELLOW7, ANSI_RESET8, warnedTemplateStrings;
 var init_llm_grader = __esm({
   "src/evaluation/graders/llm-grader.ts"() {
     "use strict";
     init_cjs_shims();
     import_promises12 = __toESM(require("fs/promises"), 1);
     import_node_path12 = __toESM(require("path"), 1);
-    import_ai = require("ai");
     import_zod2 = require("zod");
     init_content_preprocessor();
     init_content();
@@ -6095,18 +6133,15 @@ ${context2.toolCalls}`;
   }
   }
   // ---------------------------------------------------------------------------
-  // Built-in agent mode (agentv provider —
+  // Built-in agent mode (agentv provider — provider.invoke() with filesystem tools)
   // ---------------------------------------------------------------------------
   /**
-   * Built-in mode:
+   * Built-in mode: drives the grader through provider.invoke() with the
+   * sandboxed filesystem tools and a step budget. The pi-ai-backed agentv
+   * provider runs the agent loop (tool call → tool execute → next model
+   * turn) until the model stops requesting tools or maxSteps is hit.
    */
   async evaluateBuiltIn(context2, graderProvider) {
-    const model = graderProvider.asLanguageModel?.();
-    if (!model) {
-      throw new Error(
-        `Grader provider '${graderProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent mode`
-      );
-    }
     const workspacePath = context2.workspacePath;
     if (!workspacePath) {
       throw new Error(
@@ -6125,18 +6160,21 @@ ${context2.toolCalls}`;
       maxSteps: this.maxSteps
     };
     try {
-      const
-
-
-
+      const response = await graderProvider.invoke({
+        question: userPrompt,
+        systemPrompt,
+        evalCaseId: context2.evalCase.id,
+        attempt: context2.attempt,
+        temperature: this.temperature ?? 0,
         tools: fsTools,
-
-        temperature: this.temperature ?? 0
+        maxSteps: this.maxSteps
       });
-      const
+      const text = extractLastAssistantContent2(response.output);
+      const stepCount = response.steps?.count ?? 1;
+      const toolCallCount = response.steps?.toolCallCount ?? 0;
       const details = {
         mode: "built-in",
-        steps:
+        steps: stepCount,
         tool_calls: toolCallCount
       };
       return this.parseAgentResult(
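Annotation: with `asLanguageModel()` gone, every grader path in this diff funnels through a single `provider.invoke()` contract. A reconstructed sketch of that contract, inferred only from the call sites and `mapPiResponse` in this bundle — it is not a documented public type:

```js
// Inferred request/response shape for provider.invoke() in this bundle.
const response = await graderProvider.invoke({
  question, systemPrompt,         // prompts
  evalCaseId, attempt,            // bookkeeping
  temperature, maxOutputTokens,   // sampling (optional)
  tools, maxSteps,                // agent loop (optional)
  images                          // vision inputs (optional)
});
// response.output     -> [{ role: "assistant", content: string }]
// response.tokenUsage -> { input, output, cached? }
// response.steps?     -> { count, toolCallCount } when tools were passed
```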
@@ -6588,43 +6626,14 @@ ${outputSchema}`;
   }
   async generateStructuredResponse(options) {
     const { context: context2, graderProvider, systemPrompt, userPrompt, images } = options;
-    const model = graderProvider.asLanguageModel?.();
-    if (model) {
-      const modelOptions = {
-        ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
-        ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
-      };
-      const hasImages = images && images.length > 0;
-      const result = hasImages ? await (0, import_ai.generateText)({
-        model,
-        system: systemPrompt,
-        messages: [
-          {
-            role: "user",
-            content: [
-              { type: "text", text: userPrompt },
-              ...toAiSdkImageParts(images)
-            ]
-          }
-        ],
-        ...modelOptions
-      }) : await (0, import_ai.generateText)({
-        model,
-        system: systemPrompt,
-        prompt: userPrompt,
-        ...modelOptions
-      });
-      const rawUsage = result.usage;
-      const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
-      return { text: result.text, tokenUsage };
-    }
     const response = await graderProvider.invoke({
       question: userPrompt,
       systemPrompt,
       evalCaseId: context2.evalCase.id,
       attempt: context2.attempt,
       maxOutputTokens: this.maxOutputTokens,
-      temperature: this.temperature
+      temperature: this.temperature,
+      ...images && images.length > 0 ? { images } : {}
     });
     return {
       text: extractLastAssistantContent2(response.output),
@@ -6640,12 +6649,11 @@ ${outputSchema}`;
     });
 
 // src/evaluation/graders/composite.ts
-var
+var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT, CompositeGrader;
 var init_composite = __esm({
   "src/evaluation/graders/composite.ts"() {
     "use strict";
     init_cjs_shims();
-    import_ai2 = require("ai");
     init_types2();
     init_code_grader();
     init_llm_grader();
@@ -6888,25 +6896,6 @@ Return a JSON object with: score (0.0-1.0), verdict (pass/fail), and reasoning.`
       target: graderProvider.targetName
     };
     try {
-      const model = graderProvider.asLanguageModel?.();
-      if (model) {
-        const { text } = await (0, import_ai2.generateText)({
-          model,
-          system: systemPrompt,
-          prompt: userPrompt
-        });
-        const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
-        const score2 = clampScore(data2.score);
-        const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
-        return {
-          score: score2,
-          verdict: scoreToVerdict(score2),
-          assertions: assertions2,
-          expectedAspectCount: Math.max(assertions2.length, 1),
-          graderRawRequest,
-          scores
-        };
-      }
       const response = await graderProvider.invoke({
         question: userPrompt,
         systemPrompt,
@@ -8689,115 +8678,254 @@ var init_graders2 = __esm({
   }
 });
 
-// src/evaluation/providers/
-
-
-
-}
-
-
-
-
-
+// src/evaluation/providers/llm-providers.ts
+function buildAzureBaseUrl(input) {
+  const trimmed = input.replace(/\/+$/, "");
+  if (trimmed.endsWith("/openai/v1")) return trimmed;
+  if (trimmed.endsWith("/openai")) return `${trimmed}/v1`;
+  return `${trimmed}/openai/v1`;
+}
+async function invokePiAi(options) {
+  const { model, apiKey, request, defaults, retryConfig, providerOptions } = options;
+  const tools = request.tools && request.tools.length > 0 ? request.tools : void 0;
+  const maxSteps = tools ? Math.max(1, request.maxSteps ?? 1) : 1;
+  const { systemPrompt, messages } = chatPromptToPiContext(buildChatPrompt(request));
+  if (request.images && request.images.length > 0) {
+    attachImagesToLastUserMessage(messages, request.images);
+  }
+  const piTools = tools ? tools.map((t) => ({
+    name: t.name,
+    description: t.description,
+    parameters: t.parameters
+  })) : void 0;
+  const ctx = { systemPrompt, messages, ...piTools ? { tools: piTools } : {} };
+  const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
+  const callOptions = {
+    ...apiKey !== void 0 ? { apiKey } : {},
+    temperature,
+    ...maxOutputTokens !== void 0 ? { maxTokens: maxOutputTokens } : {},
+    signal: request.signal,
+    ...providerOptions ?? {}
+  };
+  const startTime = (/* @__PURE__ */ new Date()).toISOString();
+  const startMs = Date.now();
+  const aggregateUsage = { input: 0, output: 0, cacheRead: 0, cost: 0 };
+  let stepCount = 0;
+  let toolCallCount = 0;
+  let result = await withRetry(
+    () => (0, import_pi_ai.complete)(model, ctx, callOptions),
+    retryConfig,
+    request.signal
+  );
+  ctx.messages.push(result);
+  stepCount = 1;
+  accumulateUsage(aggregateUsage, result.usage);
+  while (tools) {
+    const calls = result.content.filter(
+      (b) => b.type === "toolCall"
+    );
+    if (calls.length === 0) break;
+    if (stepCount >= maxSteps) break;
+    toolCallCount += calls.length;
+    for (const call of calls) {
+      const tool = tools.find((t) => t.name === call.name);
+      let output;
+      let isError = false;
+      try {
+        if (!tool) {
+          throw new Error(`pi-ai adapter: model called unknown tool '${call.name}'`);
+        }
+        output = await tool.execute(call.arguments);
+      } catch (err) {
+        output = err instanceof Error ? err.message : String(err);
+        isError = true;
+      }
+      ctx.messages.push({
+        role: "toolResult",
+        toolCallId: call.id,
+        toolName: call.name,
+        content: [
+          { type: "text", text: typeof output === "string" ? output : JSON.stringify(output) }
+        ],
+        isError,
+        timestamp: Date.now()
+      });
+    }
+    result = await withRetry(
+      () => (0, import_pi_ai.complete)(model, ctx, callOptions),
+      retryConfig,
+      request.signal
     );
+    ctx.messages.push(result);
+    stepCount += 1;
+    accumulateUsage(aggregateUsage, result.usage);
   }
-
-
-
-
+  const endTime = (/* @__PURE__ */ new Date()).toISOString();
+  const durationMs = Date.now() - startMs;
+  return mapPiResponse(result, {
+    durationMs,
+    startTime,
+    endTime,
+    aggregateUsage,
+    steps: tools ? { count: stepCount, toolCallCount } : void 0
+  });
 }
-function
-
-
-
-
-
-
-
-
-
-
+function accumulateUsage(agg, u) {
+  agg.input += u.input;
+  agg.output += u.output;
+  agg.cacheRead += u.cacheRead;
+  agg.cost += u.cost.total;
+}
+function resolvePiModel(args) {
+  const { providerName, apiId, modelId, baseUrl } = args;
+  let model;
+  try {
+    model = (0, import_pi_ai.getModel)(providerName, modelId);
+  } catch {
+    model = void 0;
+  }
+  if (!model) {
+    const fallbackBaseUrl = baseUrl ?? defaultBaseUrlFor(providerName);
+    if (!fallbackBaseUrl) {
       throw new Error(
-        `
+        `pi-ai adapter cannot resolve a baseUrl for provider '${providerName}' / model '${modelId}'. Either set the target's baseUrl/endpoint or use a model id pi-ai recognizes.`
       );
-
-
-
-
-
-
-
-
-
-
-
-
-  id;
-  kind = "agentv";
-  targetName;
-  model;
-  constructor(targetName, config) {
-    this.id = `agentv:${targetName}`;
-    this.targetName = targetName;
-    this.model = createLanguageModel(config.model);
-  }
-  /**
-   * Direct invoke is not supported for the agentv provider.
-   * Use asLanguageModel() with generateText() instead.
-   */
-  async invoke(_request) {
-    throw new Error(
-      "AgentvProvider does not support direct invoke(). Use asLanguageModel() with generateText() instead."
-    );
-  }
-  /**
-   * Returns the resolved AI SDK LanguageModel for use with generateText/generateObject.
-   */
-  asLanguageModel() {
-    return this.model;
-  }
+    }
+    model = {
+      id: modelId,
+      name: modelId,
+      api: apiId,
+      provider: providerName,
+      baseUrl: fallbackBaseUrl,
+      reasoning: false,
+      input: ["text"],
+      cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
+      contextWindow: 128e3,
+      maxTokens: 16384
     };
   }
-
-
-
-
-
-    apiKey: config.apiKey,
-    apiVersion: config.version,
-    // Chat completions still use deployment-scoped Azure URLs for compatibility
-    // with existing deployments. Responses API should use the SDK's v1 path.
-    useDeploymentBasedUrls: config.apiFormat !== "responses"
-  };
-  const baseURL = normalizeAzureBaseUrl(config.resourceName);
-  if (baseURL) {
-    options.baseURL = baseURL;
-  } else {
-    options.resourceName = config.resourceName;
+  if (model.api !== apiId) {
+    model = { ...model, api: apiId };
+  }
+  if (baseUrl) {
+    model = { ...model, baseUrl };
   }
-  return
+  return model;
 }
-function
-
-  if (
-
+function defaultBaseUrlFor(providerName) {
+  if (providerName === "openai") return "https://api.openai.com/v1";
+  if (providerName === "openrouter") return "https://openrouter.ai/api/v1";
+  return void 0;
+}
+function chatPromptToPiContext(chatPrompt) {
+  const systemSegments = [];
+  const messages = [];
+  const now = Date.now();
+  for (const message of chatPrompt) {
+    if (message.role === "system") {
+      systemSegments.push(message.content);
+      continue;
+    }
+    if (message.role === "user") {
+      messages.push({ role: "user", content: message.content, timestamp: now });
+      continue;
+    }
+    if (message.role === "assistant") {
+      messages.push({
+        role: "assistant",
+        content: [{ type: "text", text: message.content }],
+        api: "",
+        provider: "",
+        model: "",
+        usage: {
+          input: 0,
+          output: 0,
+          cacheRead: 0,
+          cacheWrite: 0,
+          totalTokens: 0,
+          cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }
+        },
+        stopReason: "stop",
+        timestamp: now
+      });
+      continue;
+    }
+    if (message.role === "tool" || message.role === "function") {
+      const prefix = message.name ? `@[${message.name}]: ` : "@[Tool]: ";
+      messages.push({
+        role: "assistant",
+        content: [{ type: "text", text: `${prefix}${message.content}` }],
+        api: "",
+        provider: "",
+        model: "",
+        usage: {
+          input: 0,
+          output: 0,
+          cacheRead: 0,
+          cacheWrite: 0,
+          totalTokens: 0,
+          cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }
+        },
+        stopReason: "stop",
+        timestamp: now
+      });
+      continue;
+    }
+    throw new Error(`pi-ai adapter received unsupported message role '${message.role}'.`);
   }
-
-
-
+  return {
+    systemPrompt: systemSegments.length > 0 ? systemSegments.join("\n\n") : void 0,
+    messages
+  };
 }
-function
-  if (
-
+function attachImagesToLastUserMessage(messages, images) {
+  if (!images || images.length === 0) return;
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const m = messages[i];
+    if (m.role !== "user") continue;
+    const text = typeof m.content === "string" ? m.content : "";
+    messages[i] = {
+      ...m,
+      content: [
+        ...text ? [{ type: "text", text }] : [],
+        ...images.map((img) => ({
+          type: "image",
+          data: img.source,
+          mimeType: img.media_type
+        }))
+      ]
+    };
+    return;
   }
+  messages.push({
+    role: "user",
+    content: images.map((img) => ({
+      type: "image",
+      data: img.source,
+      mimeType: img.media_type
+    })),
+    timestamp: Date.now()
+  });
+}
+function mapPiResponse(result, timing) {
+  const text = result.content.filter((b) => b.type === "text").map((b) => b.text).join("");
+  const cached = timing.aggregateUsage.cacheRead > 0 ? timing.aggregateUsage.cacheRead : void 0;
+  const tokenUsage = {
+    input: timing.aggregateUsage.input,
+    output: timing.aggregateUsage.output,
+    ...cached !== void 0 ? { cached } : {}
+  };
+  const costUsd = timing.aggregateUsage.cost > 0 ? timing.aggregateUsage.cost : void 0;
   return {
-
-
-
-
-
-
+    raw: result,
+    usage: toJsonObject(result.usage),
+    output: [{ role: "assistant", content: text }],
+    tokenUsage,
+    ...costUsd !== void 0 ? { costUsd } : {},
+    durationMs: timing.durationMs,
+    startTime: timing.startTime,
+    endTime: timing.endTime,
+    ...timing.steps ? { steps: timing.steps } : {}
   };
 }
 function buildChatPrompt(request) {
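Annotation: stripped of retry, usage, and timing bookkeeping, the control flow of the new `invokePiAi` tool loop above reduces to the following paraphrase (`toToolResult` is a hypothetical condensation of the inline toolResult construction, not a function in the bundle):

```js
let result = await complete(model, ctx, callOptions);     // first model turn
ctx.messages.push(result);
while (tools) {
  const calls = result.content.filter((b) => b.type === "toolCall");
  if (calls.length === 0 || stepCount >= maxSteps) break; // done, or budget spent
  for (const call of calls) {
    // Tool errors become isError tool results; they never abort the loop.
    ctx.messages.push(await toToolResult(call, tools));
  }
  result = await complete(model, ctx, callOptions);       // next model turn
}
```

Note that the budget check runs before the pending tool calls execute, so `maxSteps` counts model turns rather than tool invocations.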
@@ -8812,92 +8940,21 @@ function buildChatPrompt(request) {
   }
   const systemContent = resolveSystemContent(request);
   const userContent = request.question.trim();
-
+  return [
     { role: "system", content: systemContent },
     { role: "user", content: userContent }
   ];
-  return prompt;
 }
 function resolveSystemContent(request) {
-  const systemSegments = [];
   if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
-
-  } else {
-    systemSegments.push(DEFAULT_SYSTEM_PROMPT);
+    return request.systemPrompt.trim();
   }
-  return
-}
-function toModelMessages(chatPrompt) {
-  return chatPrompt.map((message) => {
-    if (message.role === "tool" || message.role === "function") {
-      const prefix = message.name ? `@[${message.name}]: ` : "@[Tool]: ";
-      return {
-        role: "assistant",
-        content: `${prefix}${message.content}`
-      };
-    }
-    if (message.role === "assistant" || message.role === "system" || message.role === "user") {
-      return {
-        role: message.role,
-        content: message.content
-      };
-    }
-    return {
-      role: "user",
-      content: message.content
-    };
-  });
+  return DEFAULT_SYSTEM_PROMPT;
 }
 function resolveModelSettings(request, defaults) {
-  const temperature = request.temperature ?? defaults.temperature;
-  const maxOutputTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
   return {
-    temperature,
-    maxOutputTokens
-  };
-}
-async function invokeModel(options) {
-  const { model, request, defaults, retryConfig, providerOptions } = options;
-  const chatPrompt = buildChatPrompt(request);
-  const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
-  const startTime = (/* @__PURE__ */ new Date()).toISOString();
-  const startMs = Date.now();
-  const result = await withRetry(
-    () => (0, import_ai3.generateText)({
-      model,
-      messages: toModelMessages(chatPrompt),
-      temperature,
-      maxOutputTokens,
-      maxRetries: 0,
-      abortSignal: request.signal,
-      ...providerOptions ? { providerOptions } : {}
-    }),
-    retryConfig,
-    request.signal
-  );
-  const endTime = (/* @__PURE__ */ new Date()).toISOString();
-  const durationMs = Date.now() - startMs;
-  return mapResponse(result, { durationMs, startTime, endTime });
-}
-function mapResponse(result, timing) {
-  const content = result.text ?? "";
-  const rawUsage = result.totalUsage ?? result.usage;
-  const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
-  const cached = rawUsage?.inputTokenDetails?.cacheReadTokens ?? void 0;
-  const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? {
-    input: rawUsage.inputTokens,
-    output: rawUsage.outputTokens,
-    ...reasoning != null ? { reasoning } : {},
-    ...cached != null ? { cached } : {}
-  } : void 0;
-  return {
-    raw: result,
-    usage: toJsonObject(rawUsage),
-    output: [{ role: "assistant", content }],
-    tokenUsage,
-    durationMs: timing?.durationMs,
-    startTime: timing?.startTime,
-    endTime: timing?.endTime
+    temperature: request.temperature ?? defaults.temperature,
+    maxOutputTokens: request.maxOutputTokens ?? defaults.maxOutputTokens
   };
 }
 function toJsonObject(value) {
@@ -8911,9 +8968,7 @@ function toJsonObject(value) {
   }
 }
 function extractStatus(error) {
-  if (!error || typeof error !== "object")
-    return void 0;
-  }
+  if (!error || typeof error !== "object") return void 0;
   const candidate = error;
   const directStatus = candidate.status ?? candidate.statusCode;
   if (typeof directStatus === "number" && Number.isFinite(directStatus)) {
@@ -8928,21 +8983,15 @@ function extractStatus(error) {
     const match = message.match(/HTTP\s+(\d{3})/i);
     if (match) {
       const parsed = Number.parseInt(match[1], 10);
-      if (Number.isFinite(parsed))
-        return parsed;
-      }
+      if (Number.isFinite(parsed)) return parsed;
     }
   }
   return void 0;
 }
 function isNetworkError(error) {
-  if (!error || typeof error !== "object")
-    return false;
-  }
+  if (!error || typeof error !== "object") return false;
   const candidate = error;
-  if (candidate.name === "AbortError")
-    return false;
-  }
+  if (candidate.name === "AbortError") return false;
   const code = candidate.code;
   if (typeof code === "string" && /^E(AI|CONN|HOST|NET|PIPE|TIME|REFUSED|RESET)/i.test(code)) {
     return true;
@@ -8955,12 +9004,8 @@ function isNetworkError(error) {
 }
 function isRetryableError(error, retryableStatusCodes) {
   const status = extractStatus(error);
-  if (status === 401 || status === 403)
-
-  }
-  if (typeof status === "number") {
-    return retryableStatusCodes.includes(status);
-  }
+  if (status === 401 || status === 403) return false;
+  if (typeof status === "number") return retryableStatusCodes.includes(status);
   return isNetworkError(error);
 }
 function calculateRetryDelay(attempt, config) {
@@ -8990,195 +9035,266 @@ async function withRetry(fn, retryConfig, signal) {
       return await fn();
     } catch (error) {
       lastError = error;
-      if (attempt >= config.maxRetries)
-
-      }
-      if (!isRetryableError(error, config.retryableStatusCodes)) {
-        throw error;
-      }
+      if (attempt >= config.maxRetries) break;
+      if (!isRetryableError(error, config.retryableStatusCodes)) throw error;
       const delay = calculateRetryDelay(attempt, config);
       await sleep(delay);
     }
   }
   throw lastError;
 }
-var
-var
-  "src/evaluation/providers/
+var import_pi_ai, DEFAULT_SYSTEM_PROMPT, OpenAIProvider, OpenRouterProvider, AnthropicProvider, GeminiProvider, AzureProvider;
+var init_llm_providers = __esm({
+  "src/evaluation/providers/llm-providers.ts"() {
     "use strict";
     init_cjs_shims();
-
-
-    import_google2 = require("@ai-sdk/google");
-    import_openai2 = require("@ai-sdk/openai");
-    import_ai_sdk_provider = require("@openrouter/ai-sdk-provider");
-    import_ai3 = require("ai");
+    import_pi_ai = require("@mariozechner/pi-ai");
+    (0, import_pi_ai.registerBuiltInApiProviders)();
     DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
     OpenAIProvider = class {
-      constructor(targetName, config) {
-        this.config = config;
-        this.id = `openai:${targetName}`;
-        this.targetName = targetName;
-        this.defaults = {
-          temperature: config.temperature,
-          maxOutputTokens: config.maxOutputTokens
-        };
-        this.retryConfig = config.retry;
-        const openai = (0, import_openai2.createOpenAI)({
-          apiKey: config.apiKey,
-          baseURL: config.baseURL
-        });
-        this.model = config.apiFormat === "responses" ? openai(config.model) : openai.chat(config.model);
-      }
       id;
       kind = "openai";
       targetName;
-
+      piModel;
       defaults;
       retryConfig;
-
-        return invokeModel({
-          model: this.model,
-          request,
-          defaults: this.defaults,
-          retryConfig: this.retryConfig
-        });
-      }
-      asLanguageModel() {
-        return this.model;
-      }
-    };
-    AzureProvider = class {
+      apiKey;
       constructor(targetName, config) {
-        this.
-        this.id = `azure:${targetName}`;
+        this.id = `openai:${targetName}`;
         this.targetName = targetName;
+        this.apiKey = config.apiKey;
         this.defaults = {
           temperature: config.temperature,
           maxOutputTokens: config.maxOutputTokens
         };
         this.retryConfig = config.retry;
-
-
+        this.piModel = resolvePiModel({
+          providerName: "openai",
+          apiId: config.apiFormat === "responses" ? "openai-responses" : "openai-completions",
+          modelId: config.model,
+          baseUrl: config.baseURL
+        });
       }
-      id;
-      kind = "azure";
-      targetName;
-      model;
-      defaults;
-      retryConfig;
       async invoke(request) {
-        return
-          model: this.
+        return invokePiAi({
+          model: this.piModel,
+          apiKey: this.apiKey,
           request,
           defaults: this.defaults,
           retryConfig: this.retryConfig
         });
       }
-      asLanguageModel() {
-        return this.model;
-      }
     };
     OpenRouterProvider = class {
+      id;
+      kind = "openrouter";
+      targetName;
+      piModel;
+      defaults;
+      retryConfig;
+      apiKey;
      constructor(targetName, config) {
-        this.config = config;
        this.id = `openrouter:${targetName}`;
        this.targetName = targetName;
+        this.apiKey = config.apiKey;
        this.defaults = {
          temperature: config.temperature,
          maxOutputTokens: config.maxOutputTokens
        };
        this.retryConfig = config.retry;
-
-
+        this.piModel = resolvePiModel({
+          providerName: "openrouter",
+          apiId: "openai-completions",
+          modelId: config.model,
+          baseUrl: "https://openrouter.ai/api/v1"
        });
-        this.model = openrouter(config.model);
      }
-      id;
-      kind = "openrouter";
-      targetName;
-      model;
-      defaults;
-      retryConfig;
      async invoke(request) {
-        return
-          model: this.
+        return invokePiAi({
+          model: this.piModel,
+          apiKey: this.apiKey,
          request,
          defaults: this.defaults,
          retryConfig: this.retryConfig
        });
      }
-      asLanguageModel() {
-        return this.model;
-      }
    };
    AnthropicProvider = class {
+      id;
+      kind = "anthropic";
+      targetName;
+      piModel;
+      defaults;
+      retryConfig;
+      apiKey;
+      thinkingBudget;
      constructor(targetName, config) {
-        this.config = config;
        this.id = `anthropic:${targetName}`;
        this.targetName = targetName;
+        this.apiKey = config.apiKey;
+        this.thinkingBudget = config.thinkingBudget;
        this.defaults = {
          temperature: config.temperature,
          maxOutputTokens: config.maxOutputTokens,
          thinkingBudget: config.thinkingBudget
        };
        this.retryConfig = config.retry;
-
-
+        this.piModel = resolvePiModel({
+          providerName: "anthropic",
+          apiId: "anthropic-messages",
+          modelId: config.model
        });
-        this.model = anthropic(config.model);
      }
-      id;
-      kind = "anthropic";
-      targetName;
-      model;
-      defaults;
-      retryConfig;
      async invoke(request) {
-        const providerOptions =
-        return
-          model: this.
+        const providerOptions = this.thinkingBudget !== void 0 ? { thinkingEnabled: true, thinkingBudgetTokens: this.thinkingBudget } : void 0;
+        return invokePiAi({
+          model: this.piModel,
+          apiKey: this.apiKey,
          request,
          defaults: this.defaults,
          retryConfig: this.retryConfig,
-          providerOptions
+          ...providerOptions ? { providerOptions } : {}
        });
      }
-      asLanguageModel() {
-        return this.model;
-      }
    };
    GeminiProvider = class {
+      id;
+      kind = "gemini";
+      targetName;
+      piModel;
+      defaults;
+      retryConfig;
+      apiKey;
      constructor(targetName, config) {
-        this.config = config;
        this.id = `gemini:${targetName}`;
        this.targetName = targetName;
+        this.apiKey = config.apiKey;
        this.defaults = {
          temperature: config.temperature,
          maxOutputTokens: config.maxOutputTokens
        };
        this.retryConfig = config.retry;
-
-
+        this.piModel = resolvePiModel({
+          providerName: "google",
+          apiId: "google-generative-ai",
+          modelId: config.model
+        });
+      }
+      async invoke(request) {
+        return invokePiAi({
+          model: this.piModel,
+          apiKey: this.apiKey,
+          request,
+          defaults: this.defaults,
+          retryConfig: this.retryConfig
        });
-        this.model = google(config.model);
      }
+    };
+    AzureProvider = class {
      id;
-      kind = "
+      kind = "azure";
      targetName;
-
+      piModel;
      defaults;
      retryConfig;
+      apiKey;
+      providerOptions;
+      constructor(targetName, config) {
+        this.id = `azure:${targetName}`;
+        this.targetName = targetName;
+        this.apiKey = config.apiKey;
+        this.defaults = {
+          temperature: config.temperature,
+          maxOutputTokens: config.maxOutputTokens
+        };
+        this.retryConfig = config.retry;
+        const trimmed = config.resourceName.trim();
+        const isFullUrl = /^https?:\/\//i.test(trimmed);
+        const baseUrl = isFullUrl ? buildAzureBaseUrl(trimmed) : void 0;
+        this.providerOptions = {
+          ...baseUrl ? { azureBaseUrl: baseUrl } : { azureResourceName: trimmed },
+          ...config.version ? { azureApiVersion: config.version } : {}
+        };
+        this.piModel = resolvePiModel({
+          providerName: "azure-openai-responses",
+          apiId: "azure-openai-responses",
+          // The "model id" for Azure is the deployment name.
+          modelId: config.deploymentName,
+          ...baseUrl ? { baseUrl } : {}
+        });
+      }
      async invoke(request) {
-        return
-          model: this.
+        return invokePiAi({
+          model: this.piModel,
+          apiKey: this.apiKey,
          request,
          defaults: this.defaults,
-          retryConfig: this.retryConfig
+          retryConfig: this.retryConfig,
+          providerOptions: this.providerOptions
        });
      }
-
-
+    };
+  }
+});
+
+// src/evaluation/providers/agentv-provider.ts
+var agentv_provider_exports = {};
+__export(agentv_provider_exports, {
+  AgentvProvider: () => AgentvProvider
+});
+function parseAgentvModel(model) {
+  const colonIndex = model.indexOf(":");
+  if (colonIndex === -1) {
+    throw new Error(
+      `Invalid agentv model "${model}". Expected "provider:model" (e.g., "openai:gpt-5-mini").`
+    );
+  }
+  const provider = model.slice(0, colonIndex);
+  const modelId = model.slice(colonIndex + 1);
+  switch (provider) {
+    case "openai":
+      return { providerName: "openai", apiId: "openai-completions", modelId };
+    case "anthropic":
+      return { providerName: "anthropic", apiId: "anthropic-messages", modelId };
+    case "azure":
+      return {
+        providerName: "azure-openai-responses",
+        apiId: "azure-openai-responses",
+        modelId
+      };
+    case "google":
+      return { providerName: "google", apiId: "google-generative-ai", modelId };
+    default:
+      throw new Error(
+        `Unsupported agentv provider "${provider}" in "${model}". Supported: openai, anthropic, azure, google.`
+      );
+  }
+}
+var AgentvProvider;
+var init_agentv_provider = __esm({
+  "src/evaluation/providers/agentv-provider.ts"() {
+    "use strict";
+    init_cjs_shims();
+    init_llm_providers();
+    AgentvProvider = class {
+      id;
+      kind = "agentv";
+      targetName;
+      piModel;
+      defaults;
+      constructor(targetName, config) {
+        this.id = `agentv:${targetName}`;
+        this.targetName = targetName;
+        const { providerName, apiId, modelId } = parseAgentvModel(config.model);
+        this.piModel = resolvePiModel({ providerName, apiId, modelId });
+        this.defaults = { temperature: config.temperature };
+      }
+      async invoke(request) {
+        return invokePiAi({
+          model: this.piModel,
+          request,
+          defaults: this.defaults
+        });
      }
    };
  }
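Annotation: `parseAgentvModel` pins down the `provider:model` target syntax for the rewritten agentv provider. Expected behavior per the switch above (the model ids other than the one quoted in the error message are illustrative):

```js
parseAgentvModel("openai:gpt-5-mini");
// => { providerName: "openai", apiId: "openai-completions", modelId: "gpt-5-mini" }
parseAgentvModel("azure:my-deployment");
// => { providerName: "azure-openai-responses", apiId: "azure-openai-responses", modelId: "my-deployment" }
parseAgentvModel("gpt-5-mini");    // throws: missing "provider:" prefix
parseAgentvModel("mistral:large"); // throws: unsupported provider
```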
@@ -13381,10 +13497,10 @@ function extractToolCallsFromEvents(events) {
     }
   }
   const toolCalls = [];
-  for (const [id, { tool
+  for (const [id, { tool, input }] of starts) {
     toolCalls.push(
       normalizeToolCall("pi-cli", {
-        tool
+        tool,
         input,
         id: id.startsWith("anon-") ? void 0 : id,
         output: results.get(id)
@@ -17765,7 +17881,6 @@ var init_providers = __esm({
     "use strict";
     init_cjs_shims();
     init_agentv_provider();
-    init_ai_sdk();
     init_claude_cli();
     init_claude_sdk();
     init_cli();
@@ -17773,6 +17888,7 @@ var init_providers = __esm({
     init_copilot_cli();
     init_copilot_log();
     init_copilot_sdk();
+    init_llm_providers();
     init_mock();
     init_pi_cli();
     init_pi_coding_agent();
@@ -19799,6 +19915,19 @@ async function runEvaluation(options) {
     await dockerSetup.pullImage();
     setupLog("Docker image pull complete");
   }
+  if (suiteWorkspace?.env) {
+    try {
+      await runPreflightChecks(suiteWorkspace.env, sharedWorkspacePath ?? void 0, setupLog);
+      setupLog("preflight checks passed");
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      if (sharedWorkspacePath && !useStaticWorkspace) {
+        await cleanupWorkspace(sharedWorkspacePath).catch(() => {
+        });
+      }
+      throw new Error(message);
+    }
+  }
   const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
   const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
   if (sharedWorkspacePath && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) {
@@ -22069,6 +22198,38 @@ function computeWeightedMean(entries) {
   }
   return totalWeight > 0 ? weightedSum / totalWeight : 0;
 }
+async function runPreflightChecks(env, cwd, log) {
+  const execFileAsync5 = (0, import_node_util7.promisify)(import_node_child_process11.execFile);
+  const missing = [];
+  for (const cmd of env.required_commands ?? []) {
+    log(`preflight: checking command "${cmd}"`);
+    try {
+      if (process.platform === "win32") {
+        await execFileAsync5("where", [cmd], { cwd });
+      } else {
+        await execFileAsync5("sh", ["-c", `command -v ${cmd}`], { cwd });
+      }
+    } catch {
+      missing.push(`command: ${cmd}`);
+    }
+  }
+  for (const mod of env.required_python_modules ?? []) {
+    log(`preflight: checking Python module "${mod}"`);
+    try {
+      await execFileAsync5("python3", ["-c", `import ${mod}`], { cwd });
+    } catch {
+      missing.push(`python module: ${mod}`);
+    }
+  }
+  if (missing.length > 0) {
+    throw new Error(
+      `Preflight checks failed \u2014 missing dependencies:
+${missing.map((m) => `  \u2022 ${m}`).join("\n")}
+
+Install the missing dependencies before running this eval.`
+    );
+  }
+}
 var import_node_child_process11, import_node_crypto11, import_node_fs16, import_promises35, import_node_path47, import_node_util7, import_micromatch2, execFileAsync3, WORKSPACE_GIT_TIMEOUT_MS;
 var init_orchestrator = __esm({
   "src/evaluation/orchestrator.ts"() {
@@ -22931,7 +23092,8 @@ function parseWorkspaceConfig(raw, evalFileDir) {
   const workspacePath = typeof obj.path === "string" ? obj.path : void 0;
   const mode = explicitMode ?? (workspacePath ? "static" : void 0);
   const docker = parseDockerWorkspaceConfig(obj.docker);
-
+  const env = parseWorkspaceEnvConfig(obj.env);
+  if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker && !env)
     return void 0;
   return {
     ...template !== void 0 && { template },
@@ -22940,7 +23102,19 @@ function parseWorkspaceConfig(raw, evalFileDir) {
     ...hooks !== void 0 && { hooks },
     ...mode !== void 0 && { mode },
     ...workspacePath !== void 0 && { path: workspacePath },
-    ...docker !== void 0 && { docker }
+    ...docker !== void 0 && { docker },
+    ...env !== void 0 && { env }
+  };
+}
+function parseWorkspaceEnvConfig(raw) {
+  if (!isJsonObject(raw)) return void 0;
+  const obj = raw;
+  const required_commands = Array.isArray(obj.required_commands) ? obj.required_commands.filter((c) => typeof c === "string") : void 0;
+  const required_python_modules = Array.isArray(obj.required_python_modules) ? obj.required_python_modules.filter((m) => typeof m === "string") : void 0;
+  if (!required_commands?.length && !required_python_modules?.length) return void 0;
+  return {
+    ...required_commands?.length && { required_commands },
+    ...required_python_modules?.length && { required_python_modules }
   };
 }
 function parseDockerWorkspaceConfig(raw) {
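Annotation: `parseWorkspaceEnvConfig` accepts the new `workspace.env` block that feeds `runPreflightChecks` earlier in this diff. An illustrative input (the key names come from the parser; the listed values are examples):

```js
// Illustrative input to parseWorkspaceEnvConfig:
const workspace = {
  env: {
    required_commands: ["git", "jq"],    // probed with `command -v` (or `where` on win32)
    required_python_modules: ["yaml"]    // probed with `python3 -c "import yaml"`
  }
};
// Non-string entries are filtered out, and an env block with no usable
// entries is dropped entirely, so preflight only runs when something is declared.
```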
@@ -24966,8 +25140,8 @@ init_cjs_shims();
 
 // src/evaluation/generators/rubric-generator.ts
 init_cjs_shims();
-var import_ai4 = require("ai");
 var import_zod6 = require("zod");
+init_types2();
 var rubricItemSchema = import_zod6.z.object({
   id: import_zod6.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
   outcome: import_zod6.z.string().describe("Concrete expected outcome for this rubric item"),
@@ -24980,10 +25154,6 @@ var rubricGenerationSchema = import_zod6.z.object({
 async function generateRubrics(options) {
   const { criteria, question, referenceAnswer, provider } = options;
   const prompt = buildPrompt(criteria, question, referenceAnswer);
-  const model = provider.asLanguageModel?.();
-  if (!model) {
-    throw new Error("Provider does not support language model interface");
-  }
   const system = `You are an expert at creating evaluation rubrics.
 You must return a valid JSON object matching this schema:
 {
@@ -25000,11 +25170,11 @@
   let lastError;
   for (let attempt = 1; attempt <= 3; attempt++) {
     try {
-      const
-
-      system
-      prompt
+      const response = await provider.invoke({
+        question: prompt,
+        systemPrompt: system
       });
+      const text = extractLastAssistantContent2(response.output);
       const cleaned = text.replace(/```json\n?|```/g, "").trim();
       result = rubricGenerationSchema.parse(JSON.parse(cleaned));
       break;