@agentv/core 4.25.1 → 4.25.3-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agentv-provider-MUIGGIP3.js +7 -0
- package/dist/chunk-5XV3FAAD.js +616 -0
- package/dist/chunk-5XV3FAAD.js.map +1 -0
- package/dist/{chunk-6HLBKYE2.js → chunk-CALQDF2Y.js} +1 -1
- package/dist/chunk-CALQDF2Y.js.map +1 -0
- package/dist/{chunk-IXTJEXWN.js → chunk-EVEZQXIS.js} +187 -551
- package/dist/chunk-EVEZQXIS.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +591 -419
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +78 -8
- package/dist/index.d.ts +78 -8
- package/dist/index.js +7 -12
- package/dist/index.js.map +1 -1
- package/dist/ts-eval-loader-E6MROJGR.js +12 -0
- package/package.json +2 -7
- package/dist/agentv-provider-TXM4UEUT.js +0 -7
- package/dist/chunk-6HLBKYE2.js.map +0 -1
- package/dist/chunk-IXTJEXWN.js.map +0 -1
- package/dist/chunk-PRNXHNLF.js +0 -65
- package/dist/chunk-PRNXHNLF.js.map +0 -1
- package/dist/ts-eval-loader-4CFPGHGT.js +0 -12
- /package/dist/{agentv-provider-TXM4UEUT.js.map → agentv-provider-MUIGGIP3.js.map} +0 -0
- /package/dist/{ts-eval-loader-4CFPGHGT.js.map → ts-eval-loader-E6MROJGR.js.map} +0 -0
package/dist/index.cjs
CHANGED
@@ -223,7 +223,7 @@ function computeTraceSummary(messages) {
  function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
  if (summary.eventCount === 0) return void 0;
  const explorationCalls = explorationTools.reduce(
- (sum,
+ (sum, tool) => sum + (summary.toolCalls[tool] ?? 0),
  0
  );
  return explorationCalls / summary.eventCount;
@@ -5187,8 +5187,17 @@ async function materializeContentForGrader(messages, getWorkDir) {
  }
  return result;
  }
+ async function runScriptRaw(scriptPath, input, agentTimeoutMs, cwd, env) {
+ return typeof scriptPath === "string" ? execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
+ }
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
- const { stdout, stderr, exitCode } =
+ const { stdout, stderr, exitCode } = await runScriptRaw(
+ scriptPath,
+ input,
+ agentTimeoutMs,
+ cwd,
+ env
+ );
  if (exitCode !== 0) {
  const trimmedErr = formatStderr(stderr);
  throw new Error(
@@ -5306,6 +5315,8 @@ var init_code_grader = __esm({
  const env = proxyEnv || workspaceEnv ? { ...proxyEnv, ...workspaceEnv } : void 0;
  try {
  let stdout;
+ let exitCode = 0;
+ let execStderr = "";
  if (context2.dockerConfig) {
  const { DockerWorkspaceProvider: DockerWorkspaceProvider2 } = await Promise.resolve().then(() => (init_docker_workspace(), docker_workspace_exports));
  const dockerProvider = new DockerWorkspaceProvider2(context2.dockerConfig);
@@ -5314,31 +5325,42 @@ var init_code_grader = __esm({
  stdin: inputPayload,
  repoCheckouts: getRepoCheckoutTargets(context2.evalCase.workspace?.repos)
  });
-
- const trimmedErr = result.stderr.trim();
- throw new Error(
- trimmedErr.length > 0 ? `Code evaluator exited with code ${result.exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${result.exitCode}`
- );
- }
+ exitCode = result.exitCode;
  stdout = result.stdout.trim();
+ execStderr = result.stderr;
  } else {
-
+ const result = await runScriptRaw(
  this.command,
  inputPayload,
  this.agentTimeoutMs,
  this.cwd,
  env
  );
+ exitCode = result.exitCode;
+ stdout = result.stdout.trim();
+ execStderr = result.stderr;
  }
- const
- const
-
+ const looksLikeJson = stdout.startsWith("{") || stdout.startsWith("[");
+ const hasStderr = execStderr.trim().length > 0;
+ if (exitCode !== 0 && (looksLikeJson || hasStderr)) {
+ const trimmedErr = formatStderr(execStderr);
+ throw new Error(
+ trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
+ );
+ }
+ const rawParsed = parseJsonSafe(stdout);
+ const parsed = rawParsed != null && typeof rawParsed === "object" && !Array.isArray(rawParsed) ? rawParsed : void 0;
+ const passed = exitCode === 0;
+ const assertions = parsed != null && Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
  (a) => typeof a === "object" && a !== null && typeof a.text === "string"
  ).map((a) => ({
  text: String(a.text),
  passed: Boolean(a.passed),
  ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
- })) : [];
+ })) : parsed == null ? [{ text: stdout.trim() || (passed ? "exit 0" : `exit ${exitCode}`), passed }] : [];
+ const score = parsed != null ? clampScore(
+ typeof parsed.score === "number" ? parsed.score : assertions.length > 0 ? assertions.filter((a) => a.passed).length / assertions.length : 0
+ ) : passed ? 1 : 0;
  const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
  const proxyUsage = getProxyUsage?.();
  const graderRawRequest = {
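Note: the parsing added above implies an output contract for code evaluator scripts, sketched below in TypeScript. The field names (score, assertions with text/passed/evidence, details) and the exit-code semantics come straight from the hunk; the script name and the concrete values are purely illustrative.

// evaluate.ts (hypothetical evaluator script): print JSON to stdout, signal pass/fail via exit code.
const result = {
  score: 0.5, // optional; when omitted, the grader derives it from the assertion pass ratio
  assertions: [
    { text: "README mentions installation steps", passed: true, evidence: "## Install" },
    { text: "CHANGELOG updated", passed: false }
  ],
  details: { checked_files: 2 } // optional free-form object
};
process.stdout.write(JSON.stringify(result));
// Non-zero exit marks the case as failed; per the hunk it only throws when stderr or JSON output is present.
process.exit(result.assertions.every((a) => a.passed) ? 0 : 1);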
@@ -5646,13 +5668,6 @@ function extractImageBlocks(messages) {
  }
  return images;
  }
- function toAiSdkImageParts(images) {
- return images.map((img) => ({
- type: "image",
- image: img.source,
- mediaType: img.media_type || void 0
- }));
- }
  function resolveSandboxed(basePath, relativePath) {
  const resolved = import_node_path12.default.resolve(basePath, relativePath);
  if (!resolved.startsWith(basePath + import_node_path12.default.sep) && resolved !== basePath) {
@@ -5661,15 +5676,24 @@ function resolveSandboxed(basePath, relativePath) {
  return resolved;
  }
  function createFilesystemTools(workspacePath) {
- return
-
+ return [
+ {
+ name: "list_files",
  description: "List files and directories at a relative path within the workspace. Returns names only (single level, no recursion).",
-
-
-
+ parameters: {
+ type: "object",
+ properties: {
+ path: {
+ type: "string",
+ description: 'Relative path within workspace (use "." for root)',
+ default: "."
+ }
+ }
+ },
  execute: async (input) => {
+ const args = input ?? {};
  try {
- const resolved = resolveSandboxed(workspacePath,
+ const resolved = resolveSandboxed(workspacePath, args.path ?? ".");
  const entries = await import_promises12.default.readdir(resolved, { withFileTypes: true });
  return entries.map((e) => ({
  name: e.name,
@@ -5679,18 +5703,25 @@ function createFilesystemTools(workspacePath) {
  return { error: error instanceof Error ? error.message : String(error) };
  }
  }
- }
-
+ },
+ {
+ name: "read_file",
  description: "Read the content of a file at a relative path within the workspace. Large files are truncated at 50KB.",
-
-
-
+ parameters: {
+ type: "object",
+ properties: {
+ path: { type: "string", description: "Relative path to file within workspace" }
+ },
+ required: ["path"]
+ },
  execute: async (input) => {
+ const args = input ?? {};
+ const relPath = args.path ?? "";
  try {
- const resolved = resolveSandboxed(workspacePath,
+ const resolved = resolveSandboxed(workspacePath, relPath);
  const stat14 = await import_promises12.default.stat(resolved);
  if (stat14.isDirectory()) {
- return { error: `'${
+ return { error: `'${relPath}' is a directory, not a file` };
  }
  const buffer = Buffer.alloc(Math.min(stat14.size, MAX_FILE_SIZE));
  const fd = await import_promises12.default.open(resolved, "r");
@@ -5706,19 +5737,29 @@ function createFilesystemTools(workspacePath) {
  return { error: error instanceof Error ? error.message : String(error) };
  }
  }
- }
-
+ },
+ {
+ name: "search_files",
  description: "Search for a regex pattern across files in the workspace. Returns up to 20 matches. Skips binary files and node_modules/.git.",
-
-
-
-
+ parameters: {
+ type: "object",
+ properties: {
+ pattern: { type: "string", description: "Regex pattern to search for" },
+ path: {
+ type: "string",
+ description: 'Relative path to search within (use "." for root)',
+ default: "."
+ }
+ },
+ required: ["pattern"]
+ },
  execute: async (input) => {
+ const args = input ?? {};
  try {
- const resolved = resolveSandboxed(workspacePath,
+ const resolved = resolveSandboxed(workspacePath, args.path ?? ".");
  let regex;
  try {
- regex = new RegExp(
+ regex = new RegExp(args.pattern ?? "", "gi");
  } catch (regexErr) {
  return {
  error: `Invalid regex pattern: ${regexErr instanceof Error ? regexErr.message : String(regexErr)}`
@@ -5731,8 +5772,8 @@ function createFilesystemTools(workspacePath) {
  return { error: error instanceof Error ? error.message : String(error) };
  }
  }
- }
-
+ }
+ ];
  }
  async function searchDirectory(dirPath, workspacePath, regex, matches) {
  if (matches.length >= MAX_SEARCH_MATCHES) return;
@@ -5772,14 +5813,13 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
  }
  }
  }
- var import_promises12, import_node_path12,
+ var import_promises12, import_node_path12, import_zod2, DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT, MAX_FILE_SIZE, MAX_SEARCH_MATCHES, SEARCH_SKIP_DIRS, BINARY_EXTENSIONS, DEFAULT_GRADER_TEMPLATE, freeformEvaluationSchema, rubricCheckResultSchema, rubricEvaluationSchema, scoreRangeCheckResultSchema, scoreRangeEvaluationSchema, LlmGrader, ANSI_YELLOW7, ANSI_RESET8, warnedTemplateStrings;
  var init_llm_grader = __esm({
  "src/evaluation/graders/llm-grader.ts"() {
  "use strict";
  init_cjs_shims();
  import_promises12 = __toESM(require("fs/promises"), 1);
  import_node_path12 = __toESM(require("path"), 1);
- import_ai = require("ai");
  import_zod2 = require("zod");
  init_content_preprocessor();
  init_content();
@@ -6095,18 +6135,15 @@ ${context2.toolCalls}`;
  }
  }
  // ---------------------------------------------------------------------------
- // Built-in agent mode (agentv provider —
+ // Built-in agent mode (agentv provider — provider.invoke() with filesystem tools)
  // ---------------------------------------------------------------------------
  /**
- * Built-in mode:
+ * Built-in mode: drives the grader through provider.invoke() with the
+ * sandboxed filesystem tools and a step budget. The pi-ai-backed agentv
+ * provider runs the agent loop (tool call → tool execute → next model
+ * turn) until the model stops requesting tools or maxSteps is hit.
  */
  async evaluateBuiltIn(context2, graderProvider) {
- const model = graderProvider.asLanguageModel?.();
- if (!model) {
- throw new Error(
- `Grader provider '${graderProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent mode`
- );
- }
  const workspacePath = context2.workspacePath;
  if (!workspacePath) {
  throw new Error(
@@ -6125,18 +6162,21 @@ ${context2.toolCalls}`;
  maxSteps: this.maxSteps
  };
  try {
- const
-
-
-
+ const response = await graderProvider.invoke({
+ question: userPrompt,
+ systemPrompt,
+ evalCaseId: context2.evalCase.id,
+ attempt: context2.attempt,
+ temperature: this.temperature ?? 0,
  tools: fsTools,
-
- temperature: this.temperature ?? 0
+ maxSteps: this.maxSteps
  });
- const
+ const text = extractLastAssistantContent2(response.output);
+ const stepCount = response.steps?.count ?? 1;
+ const toolCallCount = response.steps?.toolCallCount ?? 0;
  const details = {
  mode: "built-in",
- steps:
+ steps: stepCount,
  tool_calls: toolCallCount
  };
  return this.parseAgentResult(
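Note: the built-in grader path above no longer goes through asLanguageModel(); it hands the tools and step budget to provider.invoke() and reads the aggregated counters back from response.steps. A condensed TypeScript sketch of that call shape (the literal id, attempt number, and step budget are illustrative; everything else mirrors the hunk):

const response = await graderProvider.invoke({
  question: userPrompt,
  systemPrompt,
  evalCaseId: "example-case",   // illustrative id
  attempt: 1,                   // illustrative attempt number
  temperature: 0,
  tools: fsTools,               // from createFilesystemTools(workspacePath)
  maxSteps: 8                   // illustrative step budget
});
const text = extractLastAssistantContent2(response.output);
const stepCount = response.steps?.count ?? 1;
const toolCallCount = response.steps?.toolCallCount ?? 0;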
@@ -6588,43 +6628,14 @@ ${outputSchema}`;
  }
  async generateStructuredResponse(options) {
  const { context: context2, graderProvider, systemPrompt, userPrompt, images } = options;
- const model = graderProvider.asLanguageModel?.();
- if (model) {
- const modelOptions = {
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
- };
- const hasImages = images && images.length > 0;
- const result = hasImages ? await (0, import_ai.generateText)({
- model,
- system: systemPrompt,
- messages: [
- {
- role: "user",
- content: [
- { type: "text", text: userPrompt },
- ...toAiSdkImageParts(images)
- ]
- }
- ],
- ...modelOptions
- }) : await (0, import_ai.generateText)({
- model,
- system: systemPrompt,
- prompt: userPrompt,
- ...modelOptions
- });
- const rawUsage = result.usage;
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
- return { text: result.text, tokenUsage };
- }
  const response = await graderProvider.invoke({
  question: userPrompt,
  systemPrompt,
  evalCaseId: context2.evalCase.id,
  attempt: context2.attempt,
  maxOutputTokens: this.maxOutputTokens,
- temperature: this.temperature
+ temperature: this.temperature,
+ ...images && images.length > 0 ? { images } : {}
  });
  return {
  text: extractLastAssistantContent2(response.output),
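Note: with the AI SDK image-part mapping removed, image blocks now ride on the invoke request itself and are attached to the last user message by the pi-ai adapter added later in this diff. A sketch of the forwarded shape, assuming the { source, media_type } items produced by extractImageBlocks; the base64 data-URI value is purely illustrative and not asserted by the diff:

const images = [
  { source: "data:image/png;base64,...", media_type: "image/png" } // illustrative
];
const response = await graderProvider.invoke({
  question: userPrompt,
  systemPrompt,
  maxOutputTokens: 1024, // illustrative
  temperature: 0,
  ...(images.length > 0 ? { images } : {})
});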
@@ -6640,12 +6651,11 @@ ${outputSchema}`;
  });

  // src/evaluation/graders/composite.ts
- var
+ var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT, CompositeGrader;
  var init_composite = __esm({
  "src/evaluation/graders/composite.ts"() {
  "use strict";
  init_cjs_shims();
- import_ai2 = require("ai");
  init_types2();
  init_code_grader();
  init_llm_grader();
@@ -6888,25 +6898,6 @@ Return a JSON object with: score (0.0-1.0), verdict (pass/fail), and reasoning.`
  target: graderProvider.targetName
  };
  try {
- const model = graderProvider.asLanguageModel?.();
- if (model) {
- const { text } = await (0, import_ai2.generateText)({
- model,
- system: systemPrompt,
- prompt: userPrompt
- });
- const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
- const score2 = clampScore(data2.score);
- const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
- return {
- score: score2,
- verdict: scoreToVerdict(score2),
- assertions: assertions2,
- expectedAspectCount: Math.max(assertions2.length, 1),
- graderRawRequest,
- scores
- };
- }
  const response = await graderProvider.invoke({
  question: userPrompt,
  systemPrompt,
@@ -8689,115 +8680,254 @@ var init_graders2 = __esm({
  }
  });

- // src/evaluation/providers/
-
-
-
- }
-
-
-
-
-
+ // src/evaluation/providers/llm-providers.ts
+ function buildAzureBaseUrl(input) {
+ const trimmed = input.replace(/\/+$/, "");
+ if (trimmed.endsWith("/openai/v1")) return trimmed;
+ if (trimmed.endsWith("/openai")) return `${trimmed}/v1`;
+ return `${trimmed}/openai/v1`;
+ }
+ async function invokePiAi(options) {
+ const { model, apiKey, request, defaults, retryConfig, providerOptions } = options;
+ const tools = request.tools && request.tools.length > 0 ? request.tools : void 0;
+ const maxSteps = tools ? Math.max(1, request.maxSteps ?? 1) : 1;
+ const { systemPrompt, messages } = chatPromptToPiContext(buildChatPrompt(request));
+ if (request.images && request.images.length > 0) {
+ attachImagesToLastUserMessage(messages, request.images);
+ }
+ const piTools = tools ? tools.map((t) => ({
+ name: t.name,
+ description: t.description,
+ parameters: t.parameters
+ })) : void 0;
+ const ctx = { systemPrompt, messages, ...piTools ? { tools: piTools } : {} };
+ const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
+ const callOptions = {
+ ...apiKey !== void 0 ? { apiKey } : {},
+ temperature,
+ ...maxOutputTokens !== void 0 ? { maxTokens: maxOutputTokens } : {},
+ signal: request.signal,
+ ...providerOptions ?? {}
+ };
+ const startTime = (/* @__PURE__ */ new Date()).toISOString();
+ const startMs = Date.now();
+ const aggregateUsage = { input: 0, output: 0, cacheRead: 0, cost: 0 };
+ let stepCount = 0;
+ let toolCallCount = 0;
+ let result = await withRetry(
+ () => (0, import_pi_ai.complete)(model, ctx, callOptions),
+ retryConfig,
+ request.signal
+ );
+ ctx.messages.push(result);
+ stepCount = 1;
+ accumulateUsage(aggregateUsage, result.usage);
+ while (tools) {
+ const calls = result.content.filter(
+ (b) => b.type === "toolCall"
+ );
+ if (calls.length === 0) break;
+ if (stepCount >= maxSteps) break;
+ toolCallCount += calls.length;
+ for (const call of calls) {
+ const tool = tools.find((t) => t.name === call.name);
+ let output;
+ let isError = false;
+ try {
+ if (!tool) {
+ throw new Error(`pi-ai adapter: model called unknown tool '${call.name}'`);
+ }
+ output = await tool.execute(call.arguments);
+ } catch (err) {
+ output = err instanceof Error ? err.message : String(err);
+ isError = true;
+ }
+ ctx.messages.push({
+ role: "toolResult",
+ toolCallId: call.id,
+ toolName: call.name,
+ content: [
+ { type: "text", text: typeof output === "string" ? output : JSON.stringify(output) }
+ ],
+ isError,
+ timestamp: Date.now()
+ });
+ }
+ result = await withRetry(
+ () => (0, import_pi_ai.complete)(model, ctx, callOptions),
+ retryConfig,
+ request.signal
  );
+ ctx.messages.push(result);
+ stepCount += 1;
+ accumulateUsage(aggregateUsage, result.usage);
  }
-
-
-
-
+ const endTime = (/* @__PURE__ */ new Date()).toISOString();
+ const durationMs = Date.now() - startMs;
+ return mapPiResponse(result, {
+ durationMs,
+ startTime,
+ endTime,
+ aggregateUsage,
+ steps: tools ? { count: stepCount, toolCallCount } : void 0
+ });
  }
- function
-
-
-
-
-
-
-
-
-
-
-
+ function accumulateUsage(agg, u) {
+ agg.input += u.input;
+ agg.output += u.output;
+ agg.cacheRead += u.cacheRead;
+ agg.cost += u.cost.total;
+ }
+ function resolvePiModel(args) {
+ const { providerName, apiId, modelId, baseUrl } = args;
+ let model;
+ try {
+ model = (0, import_pi_ai.getModel)(providerName, modelId);
+ } catch {
+ model = void 0;
+ }
+ if (!model) {
+ const fallbackBaseUrl = baseUrl ?? defaultBaseUrlFor(providerName);
+ if (!fallbackBaseUrl) {
  throw new Error(
- `
+ `pi-ai adapter cannot resolve a baseUrl for provider '${providerName}' / model '${modelId}'. Either set the target's baseUrl/endpoint or use a model id pi-ai recognizes.`
  );
-
-
-
-
-
-
-
-
-
-
-
-
- id;
- kind = "agentv";
- targetName;
- model;
- constructor(targetName, config) {
- this.id = `agentv:${targetName}`;
- this.targetName = targetName;
- this.model = createLanguageModel(config.model);
- }
- /**
- * Direct invoke is not supported for the agentv provider.
- * Use asLanguageModel() with generateText() instead.
- */
- async invoke(_request) {
- throw new Error(
- "AgentvProvider does not support direct invoke(). Use asLanguageModel() with generateText() instead."
- );
- }
- /**
- * Returns the resolved AI SDK LanguageModel for use with generateText/generateObject.
- */
- asLanguageModel() {
- return this.model;
- }
+ }
+ model = {
+ id: modelId,
+ name: modelId,
+ api: apiId,
+ provider: providerName,
+ baseUrl: fallbackBaseUrl,
+ reasoning: false,
+ input: ["text"],
+ cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
+ contextWindow: 128e3,
+ maxTokens: 16384
  };
  }
-
-
-
-
-
- apiKey: config.apiKey,
- apiVersion: config.version,
- // Chat completions still use deployment-scoped Azure URLs for compatibility
- // with existing deployments. Responses API should use the SDK's v1 path.
- useDeploymentBasedUrls: config.apiFormat !== "responses"
- };
- const baseURL = normalizeAzureBaseUrl(config.resourceName);
- if (baseURL) {
- options.baseURL = baseURL;
- } else {
- options.resourceName = config.resourceName;
+ if (model.api !== apiId) {
+ model = { ...model, api: apiId };
+ }
+ if (baseUrl) {
+ model = { ...model, baseUrl };
  }
- return
+ return model;
  }
- function
-
- if (
-
+ function defaultBaseUrlFor(providerName) {
+ if (providerName === "openai") return "https://api.openai.com/v1";
+ if (providerName === "openrouter") return "https://openrouter.ai/api/v1";
+ return void 0;
+ }
+ function chatPromptToPiContext(chatPrompt) {
+ const systemSegments = [];
+ const messages = [];
+ const now = Date.now();
+ for (const message of chatPrompt) {
+ if (message.role === "system") {
+ systemSegments.push(message.content);
+ continue;
+ }
+ if (message.role === "user") {
+ messages.push({ role: "user", content: message.content, timestamp: now });
+ continue;
+ }
+ if (message.role === "assistant") {
+ messages.push({
+ role: "assistant",
+ content: [{ type: "text", text: message.content }],
+ api: "",
+ provider: "",
+ model: "",
+ usage: {
+ input: 0,
+ output: 0,
+ cacheRead: 0,
+ cacheWrite: 0,
+ totalTokens: 0,
+ cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }
+ },
+ stopReason: "stop",
+ timestamp: now
+ });
+ continue;
+ }
+ if (message.role === "tool" || message.role === "function") {
+ const prefix = message.name ? `@[${message.name}]: ` : "@[Tool]: ";
+ messages.push({
+ role: "assistant",
+ content: [{ type: "text", text: `${prefix}${message.content}` }],
+ api: "",
+ provider: "",
+ model: "",
+ usage: {
+ input: 0,
+ output: 0,
+ cacheRead: 0,
+ cacheWrite: 0,
+ totalTokens: 0,
+ cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }
+ },
+ stopReason: "stop",
+ timestamp: now
+ });
+ continue;
+ }
+ throw new Error(`pi-ai adapter received unsupported message role '${message.role}'.`);
  }
-
-
-
+ return {
+ systemPrompt: systemSegments.length > 0 ? systemSegments.join("\n\n") : void 0,
+ messages
+ };
  }
- function
- if (
-
+ function attachImagesToLastUserMessage(messages, images) {
+ if (!images || images.length === 0) return;
+ for (let i = messages.length - 1; i >= 0; i--) {
+ const m = messages[i];
+ if (m.role !== "user") continue;
+ const text = typeof m.content === "string" ? m.content : "";
+ messages[i] = {
+ ...m,
+ content: [
+ ...text ? [{ type: "text", text }] : [],
+ ...images.map((img) => ({
+ type: "image",
+ data: img.source,
+ mimeType: img.media_type
+ }))
+ ]
+ };
+ return;
  }
+ messages.push({
+ role: "user",
+ content: images.map((img) => ({
+ type: "image",
+ data: img.source,
+ mimeType: img.media_type
+ })),
+ timestamp: Date.now()
+ });
+ }
+ function mapPiResponse(result, timing) {
+ const text = result.content.filter((b) => b.type === "text").map((b) => b.text).join("");
+ const cached = timing.aggregateUsage.cacheRead > 0 ? timing.aggregateUsage.cacheRead : void 0;
+ const tokenUsage = {
+ input: timing.aggregateUsage.input,
+ output: timing.aggregateUsage.output,
+ ...cached !== void 0 ? { cached } : {}
+ };
+ const costUsd = timing.aggregateUsage.cost > 0 ? timing.aggregateUsage.cost : void 0;
  return {
-
-
-
-
-
-
+ raw: result,
+ usage: toJsonObject(result.usage),
+ output: [{ role: "assistant", content: text }],
+ tokenUsage,
+ ...costUsd !== void 0 ? { costUsd } : {},
+ durationMs: timing.durationMs,
+ startTime: timing.startTime,
+ endTime: timing.endTime,
+ ...timing.steps ? { steps: timing.steps } : {}
  };
  }
  function buildChatPrompt(request) {
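Note: the agent loop inside invokePiAi() above is the core of the new pi-ai adapter. The TypeScript sketch below condenses it for readability; it uses only calls that appear in the hunk (complete from @mariozechner/pi-ai, "toolCall" content blocks, "toolResult" messages), with simplified types and illustrative parameter names — it is a reading aid, not the shipped implementation.

import { complete } from "@mariozechner/pi-ai";

async function runToolLoop(model: any, ctx: any, tools: any[], maxSteps: number, callOptions: any) {
  let result = await complete(model, ctx, callOptions); // first model turn
  ctx.messages.push(result);
  let steps = 1;
  while (steps < maxSteps) {
    const calls = result.content.filter((b: any) => b.type === "toolCall");
    if (calls.length === 0) break; // model stopped requesting tools
    for (const call of calls) {
      const tool = tools.find((t) => t.name === call.name);
      let output: unknown;
      let isError = false;
      try {
        output = tool ? await tool.execute(call.arguments) : `unknown tool '${call.name}'`;
        isError = !tool;
      } catch (err) {
        output = err instanceof Error ? err.message : String(err);
        isError = true;
      }
      // Feed the tool result back so the next completion can see it.
      ctx.messages.push({
        role: "toolResult",
        toolCallId: call.id,
        toolName: call.name,
        content: [{ type: "text", text: typeof output === "string" ? output : JSON.stringify(output) }],
        isError,
        timestamp: Date.now()
      });
    }
    result = await complete(model, ctx, callOptions); // next model turn
    ctx.messages.push(result);
    steps += 1;
  }
  return result;
}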
@@ -8812,92 +8942,21 @@ function buildChatPrompt(request) {
  }
  const systemContent = resolveSystemContent(request);
  const userContent = request.question.trim();
-
+ return [
  { role: "system", content: systemContent },
  { role: "user", content: userContent }
  ];
- return prompt;
  }
  function resolveSystemContent(request) {
- const systemSegments = [];
  if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
-
- } else {
- systemSegments.push(DEFAULT_SYSTEM_PROMPT);
+ return request.systemPrompt.trim();
  }
- return
- }
- function toModelMessages(chatPrompt) {
- return chatPrompt.map((message) => {
- if (message.role === "tool" || message.role === "function") {
- const prefix = message.name ? `@[${message.name}]: ` : "@[Tool]: ";
- return {
- role: "assistant",
- content: `${prefix}${message.content}`
- };
- }
- if (message.role === "assistant" || message.role === "system" || message.role === "user") {
- return {
- role: message.role,
- content: message.content
- };
- }
- return {
- role: "user",
- content: message.content
- };
- });
+ return DEFAULT_SYSTEM_PROMPT;
  }
  function resolveModelSettings(request, defaults) {
- const temperature = request.temperature ?? defaults.temperature;
- const maxOutputTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
  return {
- temperature,
- maxOutputTokens
- };
- }
- async function invokeModel(options) {
- const { model, request, defaults, retryConfig, providerOptions } = options;
- const chatPrompt = buildChatPrompt(request);
- const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
- const startTime = (/* @__PURE__ */ new Date()).toISOString();
- const startMs = Date.now();
- const result = await withRetry(
- () => (0, import_ai3.generateText)({
- model,
- messages: toModelMessages(chatPrompt),
- temperature,
- maxOutputTokens,
- maxRetries: 0,
- abortSignal: request.signal,
- ...providerOptions ? { providerOptions } : {}
- }),
- retryConfig,
- request.signal
- );
- const endTime = (/* @__PURE__ */ new Date()).toISOString();
- const durationMs = Date.now() - startMs;
- return mapResponse(result, { durationMs, startTime, endTime });
- }
- function mapResponse(result, timing) {
- const content = result.text ?? "";
- const rawUsage = result.totalUsage ?? result.usage;
- const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
- const cached = rawUsage?.inputTokenDetails?.cacheReadTokens ?? void 0;
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? {
- input: rawUsage.inputTokens,
- output: rawUsage.outputTokens,
- ...reasoning != null ? { reasoning } : {},
- ...cached != null ? { cached } : {}
- } : void 0;
- return {
- raw: result,
- usage: toJsonObject(rawUsage),
- output: [{ role: "assistant", content }],
- tokenUsage,
- durationMs: timing?.durationMs,
- startTime: timing?.startTime,
- endTime: timing?.endTime
+ temperature: request.temperature ?? defaults.temperature,
+ maxOutputTokens: request.maxOutputTokens ?? defaults.maxOutputTokens
  };
  }
  function toJsonObject(value) {
@@ -8911,9 +8970,7 @@ function toJsonObject(value) {
  }
  }
  function extractStatus(error) {
- if (!error || typeof error !== "object")
- return void 0;
- }
+ if (!error || typeof error !== "object") return void 0;
  const candidate = error;
  const directStatus = candidate.status ?? candidate.statusCode;
  if (typeof directStatus === "number" && Number.isFinite(directStatus)) {
@@ -8928,21 +8985,15 @@ function extractStatus(error) {
  const match = message.match(/HTTP\s+(\d{3})/i);
  if (match) {
  const parsed = Number.parseInt(match[1], 10);
- if (Number.isFinite(parsed))
- return parsed;
- }
+ if (Number.isFinite(parsed)) return parsed;
  }
  }
  return void 0;
  }
  function isNetworkError(error) {
- if (!error || typeof error !== "object")
- return false;
- }
+ if (!error || typeof error !== "object") return false;
  const candidate = error;
- if (candidate.name === "AbortError")
- return false;
- }
+ if (candidate.name === "AbortError") return false;
  const code = candidate.code;
  if (typeof code === "string" && /^E(AI|CONN|HOST|NET|PIPE|TIME|REFUSED|RESET)/i.test(code)) {
  return true;
@@ -8955,12 +9006,8 @@ function isNetworkError(error) {
  }
  function isRetryableError(error, retryableStatusCodes) {
  const status = extractStatus(error);
- if (status === 401 || status === 403)
-
- }
- if (typeof status === "number") {
- return retryableStatusCodes.includes(status);
- }
+ if (status === 401 || status === 403) return false;
+ if (typeof status === "number") return retryableStatusCodes.includes(status);
  return isNetworkError(error);
  }
  function calculateRetryDelay(attempt, config) {
@@ -8990,195 +9037,266 @@ async function withRetry(fn, retryConfig, signal) {
  return await fn();
  } catch (error) {
  lastError = error;
- if (attempt >= config.maxRetries)
-
- }
- if (!isRetryableError(error, config.retryableStatusCodes)) {
- throw error;
- }
+ if (attempt >= config.maxRetries) break;
+ if (!isRetryableError(error, config.retryableStatusCodes)) throw error;
  const delay = calculateRetryDelay(attempt, config);
  await sleep(delay);
  }
  }
  throw lastError;
  }
- var
- var
- "src/evaluation/providers/
+ var import_pi_ai, DEFAULT_SYSTEM_PROMPT, OpenAIProvider, OpenRouterProvider, AnthropicProvider, GeminiProvider, AzureProvider;
+ var init_llm_providers = __esm({
+ "src/evaluation/providers/llm-providers.ts"() {
  "use strict";
  init_cjs_shims();
-
-
- import_google2 = require("@ai-sdk/google");
- import_openai2 = require("@ai-sdk/openai");
- import_ai_sdk_provider = require("@openrouter/ai-sdk-provider");
- import_ai3 = require("ai");
+ import_pi_ai = require("@mariozechner/pi-ai");
+ (0, import_pi_ai.registerBuiltInApiProviders)();
  DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
  OpenAIProvider = class {
- constructor(targetName, config) {
- this.config = config;
- this.id = `openai:${targetName}`;
- this.targetName = targetName;
- this.defaults = {
- temperature: config.temperature,
- maxOutputTokens: config.maxOutputTokens
- };
- this.retryConfig = config.retry;
- const openai = (0, import_openai2.createOpenAI)({
- apiKey: config.apiKey,
- baseURL: config.baseURL
- });
- this.model = config.apiFormat === "responses" ? openai(config.model) : openai.chat(config.model);
- }
  id;
  kind = "openai";
  targetName;
-
+ piModel;
  defaults;
  retryConfig;
-
- return invokeModel({
- model: this.model,
- request,
- defaults: this.defaults,
- retryConfig: this.retryConfig
- });
- }
- asLanguageModel() {
- return this.model;
- }
- };
- AzureProvider = class {
+ apiKey;
  constructor(targetName, config) {
- this.
- this.id = `azure:${targetName}`;
+ this.id = `openai:${targetName}`;
  this.targetName = targetName;
+ this.apiKey = config.apiKey;
  this.defaults = {
  temperature: config.temperature,
  maxOutputTokens: config.maxOutputTokens
  };
  this.retryConfig = config.retry;
-
-
+ this.piModel = resolvePiModel({
+ providerName: "openai",
+ apiId: config.apiFormat === "responses" ? "openai-responses" : "openai-completions",
+ modelId: config.model,
+ baseUrl: config.baseURL
+ });
  }
- id;
- kind = "azure";
- targetName;
- model;
- defaults;
- retryConfig;
  async invoke(request) {
- return
- model: this.
+ return invokePiAi({
+ model: this.piModel,
+ apiKey: this.apiKey,
  request,
  defaults: this.defaults,
  retryConfig: this.retryConfig
  });
  }
- asLanguageModel() {
- return this.model;
- }
  };
  OpenRouterProvider = class {
+ id;
+ kind = "openrouter";
+ targetName;
+ piModel;
+ defaults;
+ retryConfig;
+ apiKey;
  constructor(targetName, config) {
- this.config = config;
  this.id = `openrouter:${targetName}`;
  this.targetName = targetName;
+ this.apiKey = config.apiKey;
  this.defaults = {
  temperature: config.temperature,
  maxOutputTokens: config.maxOutputTokens
  };
  this.retryConfig = config.retry;
-
-
+ this.piModel = resolvePiModel({
+ providerName: "openrouter",
+ apiId: "openai-completions",
+ modelId: config.model,
+ baseUrl: "https://openrouter.ai/api/v1"
  });
- this.model = openrouter(config.model);
  }
- id;
- kind = "openrouter";
- targetName;
- model;
- defaults;
- retryConfig;
  async invoke(request) {
- return
- model: this.
+ return invokePiAi({
+ model: this.piModel,
+ apiKey: this.apiKey,
  request,
  defaults: this.defaults,
  retryConfig: this.retryConfig
  });
  }
- asLanguageModel() {
- return this.model;
- }
  };
  AnthropicProvider = class {
+ id;
+ kind = "anthropic";
+ targetName;
+ piModel;
+ defaults;
+ retryConfig;
+ apiKey;
+ thinkingBudget;
  constructor(targetName, config) {
- this.config = config;
  this.id = `anthropic:${targetName}`;
  this.targetName = targetName;
+ this.apiKey = config.apiKey;
+ this.thinkingBudget = config.thinkingBudget;
  this.defaults = {
  temperature: config.temperature,
  maxOutputTokens: config.maxOutputTokens,
  thinkingBudget: config.thinkingBudget
  };
  this.retryConfig = config.retry;
-
-
+ this.piModel = resolvePiModel({
+ providerName: "anthropic",
+ apiId: "anthropic-messages",
+ modelId: config.model
  });
- this.model = anthropic(config.model);
  }
- id;
- kind = "anthropic";
- targetName;
- model;
- defaults;
- retryConfig;
  async invoke(request) {
- const providerOptions =
- return
- model: this.
+ const providerOptions = this.thinkingBudget !== void 0 ? { thinkingEnabled: true, thinkingBudgetTokens: this.thinkingBudget } : void 0;
+ return invokePiAi({
+ model: this.piModel,
+ apiKey: this.apiKey,
  request,
  defaults: this.defaults,
  retryConfig: this.retryConfig,
- providerOptions
+ ...providerOptions ? { providerOptions } : {}
  });
  }
- asLanguageModel() {
- return this.model;
- }
  };
  GeminiProvider = class {
+ id;
+ kind = "gemini";
+ targetName;
+ piModel;
+ defaults;
+ retryConfig;
+ apiKey;
  constructor(targetName, config) {
- this.config = config;
  this.id = `gemini:${targetName}`;
  this.targetName = targetName;
+ this.apiKey = config.apiKey;
  this.defaults = {
  temperature: config.temperature,
  maxOutputTokens: config.maxOutputTokens
  };
  this.retryConfig = config.retry;
-
-
+ this.piModel = resolvePiModel({
+ providerName: "google",
+ apiId: "google-generative-ai",
+ modelId: config.model
+ });
+ }
+ async invoke(request) {
+ return invokePiAi({
+ model: this.piModel,
+ apiKey: this.apiKey,
+ request,
+ defaults: this.defaults,
+ retryConfig: this.retryConfig
  });
- this.model = google(config.model);
  }
+ };
+ AzureProvider = class {
  id;
- kind = "
+ kind = "azure";
  targetName;
-
+ piModel;
  defaults;
  retryConfig;
+ apiKey;
+ providerOptions;
+ constructor(targetName, config) {
+ this.id = `azure:${targetName}`;
+ this.targetName = targetName;
+ this.apiKey = config.apiKey;
+ this.defaults = {
+ temperature: config.temperature,
+ maxOutputTokens: config.maxOutputTokens
+ };
+ this.retryConfig = config.retry;
+ const trimmed = config.resourceName.trim();
+ const isFullUrl = /^https?:\/\//i.test(trimmed);
+ const baseUrl = isFullUrl ? buildAzureBaseUrl(trimmed) : void 0;
+ this.providerOptions = {
+ ...baseUrl ? { azureBaseUrl: baseUrl } : { azureResourceName: trimmed },
+ ...config.version ? { azureApiVersion: config.version } : {}
+ };
+ this.piModel = resolvePiModel({
+ providerName: "azure-openai-responses",
+ apiId: "azure-openai-responses",
+ // The "model id" for Azure is the deployment name.
+ modelId: config.deploymentName,
+ ...baseUrl ? { baseUrl } : {}
+ });
+ }
  async invoke(request) {
- return
- model: this.
+ return invokePiAi({
+ model: this.piModel,
+ apiKey: this.apiKey,
  request,
  defaults: this.defaults,
- retryConfig: this.retryConfig
+ retryConfig: this.retryConfig,
+ providerOptions: this.providerOptions
  });
  }
-
-
+ };
+ }
+ });
+
+ // src/evaluation/providers/agentv-provider.ts
+ var agentv_provider_exports = {};
+ __export(agentv_provider_exports, {
+ AgentvProvider: () => AgentvProvider
+ });
+ function parseAgentvModel(model) {
+ const colonIndex = model.indexOf(":");
+ if (colonIndex === -1) {
+ throw new Error(
+ `Invalid agentv model "${model}". Expected "provider:model" (e.g., "openai:gpt-5-mini").`
+ );
+ }
+ const provider = model.slice(0, colonIndex);
+ const modelId = model.slice(colonIndex + 1);
+ switch (provider) {
+ case "openai":
+ return { providerName: "openai", apiId: "openai-completions", modelId };
+ case "anthropic":
+ return { providerName: "anthropic", apiId: "anthropic-messages", modelId };
+ case "azure":
+ return {
+ providerName: "azure-openai-responses",
+ apiId: "azure-openai-responses",
+ modelId
+ };
+ case "google":
+ return { providerName: "google", apiId: "google-generative-ai", modelId };
+ default:
+ throw new Error(
+ `Unsupported agentv provider "${provider}" in "${model}". Supported: openai, anthropic, azure, google.`
+ );
+ }
+ }
+ var AgentvProvider;
+ var init_agentv_provider = __esm({
+ "src/evaluation/providers/agentv-provider.ts"() {
+ "use strict";
+ init_cjs_shims();
+ init_llm_providers();
+ AgentvProvider = class {
+ id;
+ kind = "agentv";
+ targetName;
+ piModel;
+ defaults;
+ constructor(targetName, config) {
+ this.id = `agentv:${targetName}`;
+ this.targetName = targetName;
+ const { providerName, apiId, modelId } = parseAgentvModel(config.model);
+ this.piModel = resolvePiModel({ providerName, apiId, modelId });
+ this.defaults = { temperature: config.temperature };
+ }
+ async invoke(request) {
+ return invokePiAi({
+ model: this.piModel,
+ request,
+ defaults: this.defaults
+ });
  }
  };
  }
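Note: the new AgentvProvider takes its model as a "provider:model" string and maps it onto a pi-ai provider/api pair via parseAgentvModel() above. A short TypeScript usage sketch; the "openai:gpt-5-mini" form is the example used in the error message in the hunk, the other model ids are illustrative only.

parseAgentvModel("openai:gpt-5-mini");
// -> { providerName: "openai", apiId: "openai-completions", modelId: "gpt-5-mini" }
parseAgentvModel("anthropic:claude-sonnet-4-5"); // illustrative model id
// -> { providerName: "anthropic", apiId: "anthropic-messages", modelId: "claude-sonnet-4-5" }
parseAgentvModel("azure:my-deployment"); // illustrative deployment name
// -> { providerName: "azure-openai-responses", apiId: "azure-openai-responses", modelId: "my-deployment" }
parseAgentvModel("mistral:small");
// -> throws: Unsupported agentv provider "mistral" ... Supported: openai, anthropic, azure, google.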
@@ -13381,10 +13499,10 @@ function extractToolCallsFromEvents(events) {
  }
  }
  const toolCalls = [];
- for (const [id, { tool
+ for (const [id, { tool, input }] of starts) {
  toolCalls.push(
  normalizeToolCall("pi-cli", {
- tool
+ tool,
  input,
  id: id.startsWith("anon-") ? void 0 : id,
  output: results.get(id)
@@ -17765,7 +17883,6 @@ var init_providers = __esm({
  "use strict";
  init_cjs_shims();
  init_agentv_provider();
- init_ai_sdk();
  init_claude_cli();
  init_claude_sdk();
  init_cli();
@@ -17773,6 +17890,7 @@ var init_providers = __esm({
  init_copilot_cli();
  init_copilot_log();
  init_copilot_sdk();
+ init_llm_providers();
  init_mock();
  init_pi_cli();
  init_pi_coding_agent();
@@ -19799,6 +19917,19 @@ async function runEvaluation(options) {
  await dockerSetup.pullImage();
  setupLog("Docker image pull complete");
  }
+ if (suiteWorkspace?.env) {
+ try {
+ await runPreflightChecks(suiteWorkspace.env, sharedWorkspacePath ?? void 0, setupLog);
+ setupLog("preflight checks passed");
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ if (sharedWorkspacePath && !useStaticWorkspace) {
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
+ });
+ }
+ throw new Error(message);
+ }
+ }
  const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
  const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
  if (sharedWorkspacePath && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) {
@@ -22069,6 +22200,38 @@ function computeWeightedMean(entries) {
  }
  return totalWeight > 0 ? weightedSum / totalWeight : 0;
  }
+ async function runPreflightChecks(env, cwd, log) {
+ const execFileAsync5 = (0, import_node_util7.promisify)(import_node_child_process11.execFile);
+ const missing = [];
+ for (const cmd of env.required_commands ?? []) {
+ log(`preflight: checking command "${cmd}"`);
+ try {
+ if (process.platform === "win32") {
+ await execFileAsync5("where", [cmd], { cwd });
+ } else {
+ await execFileAsync5("sh", ["-c", `command -v ${cmd}`], { cwd });
+ }
+ } catch {
+ missing.push(`command: ${cmd}`);
+ }
+ }
+ for (const mod of env.required_python_modules ?? []) {
+ log(`preflight: checking Python module "${mod}"`);
+ try {
+ await execFileAsync5("python3", ["-c", `import ${mod}`], { cwd });
+ } catch {
+ missing.push(`python module: ${mod}`);
+ }
+ }
+ if (missing.length > 0) {
+ throw new Error(
+ `Preflight checks failed \u2014 missing dependencies:
+ ${missing.map((m) => ` \u2022 ${m}`).join("\n")}
+
+ Install the missing dependencies before running this eval.`
+ );
+ }
+ }
  var import_node_child_process11, import_node_crypto11, import_node_fs16, import_promises35, import_node_path47, import_node_util7, import_micromatch2, execFileAsync3, WORKSPACE_GIT_TIMEOUT_MS;
  var init_orchestrator = __esm({
  "src/evaluation/orchestrator.ts"() {
@@ -22931,7 +23094,8 @@ function parseWorkspaceConfig(raw, evalFileDir) {
  const workspacePath = typeof obj.path === "string" ? obj.path : void 0;
  const mode = explicitMode ?? (workspacePath ? "static" : void 0);
  const docker = parseDockerWorkspaceConfig(obj.docker);
-
+ const env = parseWorkspaceEnvConfig(obj.env);
+ if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker && !env)
  return void 0;
  return {
  ...template !== void 0 && { template },
@@ -22940,7 +23104,19 @@ function parseWorkspaceConfig(raw, evalFileDir) {
  ...hooks !== void 0 && { hooks },
  ...mode !== void 0 && { mode },
  ...workspacePath !== void 0 && { path: workspacePath },
- ...docker !== void 0 && { docker }
+ ...docker !== void 0 && { docker },
+ ...env !== void 0 && { env }
+ };
+ }
+ function parseWorkspaceEnvConfig(raw) {
+ if (!isJsonObject(raw)) return void 0;
+ const obj = raw;
+ const required_commands = Array.isArray(obj.required_commands) ? obj.required_commands.filter((c) => typeof c === "string") : void 0;
+ const required_python_modules = Array.isArray(obj.required_python_modules) ? obj.required_python_modules.filter((m) => typeof m === "string") : void 0;
+ if (!required_commands?.length && !required_python_modules?.length) return void 0;
+ return {
+ ...required_commands?.length && { required_commands },
+ ...required_python_modules?.length && { required_python_modules }
  };
  }
  function parseDockerWorkspaceConfig(raw) {
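Note: parseWorkspaceEnvConfig() above accepts exactly two string arrays on the workspace "env" block and drops everything else; runPreflightChecks() (added earlier in this diff) then verifies them before the suite runs. A TypeScript sketch of the parsed shape — the object literal stands in for however the eval file expresses it, and the specific commands/modules are illustrative:

const workspaceEnv = {
  required_commands: ["git", "rg"],        // checked via `command -v <cmd>` (or `where` on Windows)
  required_python_modules: ["yaml"]        // checked via `python3 -c "import <module>"`
};
// If any entry is missing, runPreflightChecks() throws before any cases run, and the shared
// workspace is cleaned up unless it is a static workspace.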
@@ -24966,8 +25142,8 @@ init_cjs_shims();

  // src/evaluation/generators/rubric-generator.ts
  init_cjs_shims();
- var import_ai4 = require("ai");
  var import_zod6 = require("zod");
+ init_types2();
  var rubricItemSchema = import_zod6.z.object({
  id: import_zod6.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
  outcome: import_zod6.z.string().describe("Concrete expected outcome for this rubric item"),
@@ -24980,10 +25156,6 @@ var rubricGenerationSchema = import_zod6.z.object({
  async function generateRubrics(options) {
  const { criteria, question, referenceAnswer, provider } = options;
  const prompt = buildPrompt(criteria, question, referenceAnswer);
- const model = provider.asLanguageModel?.();
- if (!model) {
- throw new Error("Provider does not support language model interface");
- }
  const system = `You are an expert at creating evaluation rubrics.
  You must return a valid JSON object matching this schema:
  {
@@ -25000,11 +25172,11 @@ You must return a valid JSON object matching this schema:
  let lastError;
  for (let attempt = 1; attempt <= 3; attempt++) {
  try {
- const
-
- system
- prompt
+ const response = await provider.invoke({
+ question: prompt,
+ systemPrompt: system
  });
+ const text = extractLastAssistantContent2(response.output);
  const cleaned = text.replace(/```json\n?|```/g, "").trim();
  result = rubricGenerationSchema.parse(JSON.parse(cleaned));
  break;