agentv 2.10.0 → 2.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +63 -0
- package/dist/{chunk-G3OTPFYX.js → chunk-CVC3VMZ3.js} +149 -14
- package/dist/chunk-CVC3VMZ3.js.map +1 -0
- package/dist/{chunk-RJWTL3VS.js → chunk-EXJWRKKL.js} +741 -176
- package/dist/chunk-EXJWRKKL.js.map +1 -0
- package/dist/{chunk-PC3FAOHT.js → chunk-GO7OTNQ4.js} +109 -9
- package/dist/chunk-GO7OTNQ4.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-BGRU67HI.js → dist-NYXYDALF.js} +18 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-7KFUCBIP.js → interactive-V4A3RRU3.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-G3OTPFYX.js.map +0 -1
- package/dist/chunk-PC3FAOHT.js.map +0 -1
- package/dist/chunk-RJWTL3VS.js.map +0 -1
- /package/dist/{dist-BGRU67HI.js.map → dist-NYXYDALF.js.map} +0 -0
- /package/dist/{interactive-7KFUCBIP.js.map → interactive-V4A3RRU3.js.map} +0 -0
|
@@ -148,7 +148,7 @@ var require_dist = __commonJS({
|
|
|
148
148
|
}
|
|
149
149
|
});
|
|
150
150
|
|
|
151
|
-
// ../../packages/core/dist/chunk-
|
|
151
|
+
// ../../packages/core/dist/chunk-REN5PS7B.js
|
|
152
152
|
import { constants } from "node:fs";
|
|
153
153
|
import { access, readFile } from "node:fs/promises";
|
|
154
154
|
import path from "node:path";
|
|
@@ -632,8 +632,8 @@ function getErrorMap() {
|
|
|
632
632
|
|
|
633
633
|
// ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/helpers/parseUtil.js
|
|
634
634
|
var makeIssue = (params) => {
|
|
635
|
-
const { data, path:
|
|
636
|
-
const fullPath = [...
|
|
635
|
+
const { data, path: path40, errorMaps, issueData } = params;
|
|
636
|
+
const fullPath = [...path40, ...issueData.path || []];
|
|
637
637
|
const fullIssue = {
|
|
638
638
|
...issueData,
|
|
639
639
|
path: fullPath
|
|
@@ -749,11 +749,11 @@ var errorUtil;
|
|
|
749
749
|
|
|
750
750
|
// ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/types.js
|
|
751
751
|
var ParseInputLazyPath = class {
|
|
752
|
-
constructor(parent, value,
|
|
752
|
+
constructor(parent, value, path40, key) {
|
|
753
753
|
this._cachedPath = [];
|
|
754
754
|
this.parent = parent;
|
|
755
755
|
this.data = value;
|
|
756
|
-
this._path =
|
|
756
|
+
this._path = path40;
|
|
757
757
|
this._key = key;
|
|
758
758
|
}
|
|
759
759
|
get path() {
|
|
@@ -4195,7 +4195,7 @@ var coerce = {
|
|
|
4195
4195
|
};
|
|
4196
4196
|
var NEVER = INVALID;
|
|
4197
4197
|
|
|
4198
|
-
// ../../packages/core/dist/chunk-
|
|
4198
|
+
// ../../packages/core/dist/chunk-REN5PS7B.js
|
|
4199
4199
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
4200
4200
|
var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
|
|
4201
4201
|
var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
|
|
@@ -4255,6 +4255,13 @@ var EVALUATOR_KIND_VALUES = [
|
|
|
4255
4255
|
"execution_metrics",
|
|
4256
4256
|
"agent_judge",
|
|
4257
4257
|
"contains",
|
|
4258
|
+
"contains_any",
|
|
4259
|
+
"contains_all",
|
|
4260
|
+
"icontains",
|
|
4261
|
+
"icontains_any",
|
|
4262
|
+
"icontains_all",
|
|
4263
|
+
"starts_with",
|
|
4264
|
+
"ends_with",
|
|
4258
4265
|
"regex",
|
|
4259
4266
|
"is_json",
|
|
4260
4267
|
"equals",
|
|
@@ -4641,17 +4648,17 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
4641
4648
|
providerBatching,
|
|
4642
4649
|
config: resolveCodexConfig(parsed, env, evalFilePath)
|
|
4643
4650
|
};
|
|
4644
|
-
case "copilot":
|
|
4645
4651
|
case "copilot-sdk":
|
|
4646
4652
|
case "copilot_sdk":
|
|
4647
4653
|
return {
|
|
4648
|
-
kind: "copilot",
|
|
4654
|
+
kind: "copilot-sdk",
|
|
4649
4655
|
name: parsed.name,
|
|
4650
4656
|
judgeTarget: parsed.judge_target,
|
|
4651
4657
|
workers: parsed.workers,
|
|
4652
4658
|
providerBatching,
|
|
4653
4659
|
config: resolveCopilotSdkConfig(parsed, env, evalFilePath)
|
|
4654
4660
|
};
|
|
4661
|
+
case "copilot":
|
|
4655
4662
|
case "copilot-cli":
|
|
4656
4663
|
return {
|
|
4657
4664
|
kind: "copilot-cli",
|
|
@@ -5262,8 +5269,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
5262
5269
|
const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
|
|
5263
5270
|
if (!parseResult.success) {
|
|
5264
5271
|
const firstError = parseResult.error.errors[0];
|
|
5265
|
-
const
|
|
5266
|
-
const prefix =
|
|
5272
|
+
const path310 = firstError?.path.join(".") || "";
|
|
5273
|
+
const prefix = path310 ? `${target.name} ${path310}: ` : `${target.name}: `;
|
|
5267
5274
|
throw new Error(`${prefix}${firstError?.message}`);
|
|
5268
5275
|
}
|
|
5269
5276
|
const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
|
|
@@ -5471,7 +5478,7 @@ function resolveOptionalNumberArray(source, description) {
|
|
|
5471
5478
|
}
|
|
5472
5479
|
var AGENT_PROVIDER_KINDS = [
|
|
5473
5480
|
"codex",
|
|
5474
|
-
"copilot",
|
|
5481
|
+
"copilot-sdk",
|
|
5475
5482
|
"copilot-cli",
|
|
5476
5483
|
"pi-coding-agent",
|
|
5477
5484
|
"claude",
|
|
@@ -5483,7 +5490,7 @@ var KNOWN_PROVIDERS = [
|
|
|
5483
5490
|
"anthropic",
|
|
5484
5491
|
"gemini",
|
|
5485
5492
|
"codex",
|
|
5486
|
-
"copilot",
|
|
5493
|
+
"copilot-sdk",
|
|
5487
5494
|
"copilot-cli",
|
|
5488
5495
|
"pi-coding-agent",
|
|
5489
5496
|
"pi-agent-sdk",
|
|
@@ -5502,10 +5509,10 @@ var PROVIDER_ALIASES = [
|
|
|
5502
5509
|
// alias for "gemini"
|
|
5503
5510
|
"codex-cli",
|
|
5504
5511
|
// alias for "codex"
|
|
5505
|
-
"copilot
|
|
5506
|
-
// alias for "copilot"
|
|
5512
|
+
"copilot",
|
|
5513
|
+
// alias for "copilot-cli" (default copilot experience)
|
|
5507
5514
|
"copilot_sdk",
|
|
5508
|
-
// alias for "copilot" (underscore variant)
|
|
5515
|
+
// alias for "copilot-sdk" (underscore variant)
|
|
5509
5516
|
"pi",
|
|
5510
5517
|
// alias for "pi-coding-agent"
|
|
5511
5518
|
"claude-code",
|
|
@@ -6654,10 +6661,10 @@ function assignProp(target, prop, value) {
|
|
|
6654
6661
|
configurable: true
|
|
6655
6662
|
});
|
|
6656
6663
|
}
|
|
6657
|
-
function getElementAtPath(obj,
|
|
6658
|
-
if (!
|
|
6664
|
+
function getElementAtPath(obj, path40) {
|
|
6665
|
+
if (!path40)
|
|
6659
6666
|
return obj;
|
|
6660
|
-
return
|
|
6667
|
+
return path40.reduce((acc, key) => acc?.[key], obj);
|
|
6661
6668
|
}
|
|
6662
6669
|
function promiseAllObject(promisesObj) {
|
|
6663
6670
|
const keys = Object.keys(promisesObj);
|
|
@@ -6977,11 +6984,11 @@ function aborted(x, startIndex = 0) {
|
|
|
6977
6984
|
}
|
|
6978
6985
|
return false;
|
|
6979
6986
|
}
|
|
6980
|
-
function prefixIssues(
|
|
6987
|
+
function prefixIssues(path40, issues) {
|
|
6981
6988
|
return issues.map((iss) => {
|
|
6982
6989
|
var _a17;
|
|
6983
6990
|
(_a17 = iss).path ?? (_a17.path = []);
|
|
6984
|
-
iss.path.unshift(
|
|
6991
|
+
iss.path.unshift(path40);
|
|
6985
6992
|
return iss;
|
|
6986
6993
|
});
|
|
6987
6994
|
}
|
|
@@ -7118,7 +7125,7 @@ function treeifyError(error40, _mapper) {
|
|
|
7118
7125
|
return issue2.message;
|
|
7119
7126
|
};
|
|
7120
7127
|
const result = { errors: [] };
|
|
7121
|
-
const processError = (error41,
|
|
7128
|
+
const processError = (error41, path40 = []) => {
|
|
7122
7129
|
var _a17, _b8;
|
|
7123
7130
|
for (const issue2 of error41.issues) {
|
|
7124
7131
|
if (issue2.code === "invalid_union" && issue2.errors.length) {
|
|
@@ -7128,7 +7135,7 @@ function treeifyError(error40, _mapper) {
|
|
|
7128
7135
|
} else if (issue2.code === "invalid_element") {
|
|
7129
7136
|
processError({ issues: issue2.issues }, issue2.path);
|
|
7130
7137
|
} else {
|
|
7131
|
-
const fullpath = [...
|
|
7138
|
+
const fullpath = [...path40, ...issue2.path];
|
|
7132
7139
|
if (fullpath.length === 0) {
|
|
7133
7140
|
result.errors.push(mapper(issue2));
|
|
7134
7141
|
continue;
|
|
@@ -7158,9 +7165,9 @@ function treeifyError(error40, _mapper) {
|
|
|
7158
7165
|
processError(error40);
|
|
7159
7166
|
return result;
|
|
7160
7167
|
}
|
|
7161
|
-
function toDotPath(
|
|
7168
|
+
function toDotPath(path40) {
|
|
7162
7169
|
const segs = [];
|
|
7163
|
-
for (const seg of
|
|
7170
|
+
for (const seg of path40) {
|
|
7164
7171
|
if (typeof seg === "number")
|
|
7165
7172
|
segs.push(`[${seg}]`);
|
|
7166
7173
|
else if (typeof seg === "symbol")
|
|
@@ -26713,14 +26720,14 @@ function createAzure(options = {}) {
|
|
|
26713
26720
|
description: "Azure OpenAI resource name"
|
|
26714
26721
|
});
|
|
26715
26722
|
const apiVersion = (_a17 = options.apiVersion) != null ? _a17 : "v1";
|
|
26716
|
-
const url2 = ({ path:
|
|
26723
|
+
const url2 = ({ path: path40, modelId }) => {
|
|
26717
26724
|
var _a24;
|
|
26718
26725
|
const baseUrlPrefix = (_a24 = options.baseURL) != null ? _a24 : `https://${getResourceName()}.openai.azure.com/openai`;
|
|
26719
26726
|
let fullUrl;
|
|
26720
26727
|
if (options.useDeploymentBasedUrls) {
|
|
26721
|
-
fullUrl = new URL(`${baseUrlPrefix}/deployments/${modelId}${
|
|
26728
|
+
fullUrl = new URL(`${baseUrlPrefix}/deployments/${modelId}${path40}`);
|
|
26722
26729
|
} else {
|
|
26723
|
-
fullUrl = new URL(`${baseUrlPrefix}/v1${
|
|
26730
|
+
fullUrl = new URL(`${baseUrlPrefix}/v1${path40}`);
|
|
26724
26731
|
}
|
|
26725
26732
|
fullUrl.searchParams.set("api-version", apiVersion);
|
|
26726
26733
|
return fullUrl.toString();
|
|
@@ -33952,9 +33959,9 @@ import { randomBytes } from "node:crypto";
|
|
|
33952
33959
|
import { createServer } from "node:http";
|
|
33953
33960
|
import fs2 from "node:fs/promises";
|
|
33954
33961
|
import path30 from "node:path";
|
|
33955
|
-
import { createHash, randomUUID as randomUUID7 } from "node:crypto";
|
|
33956
|
-
import { mkdir as
|
|
33957
|
-
import
|
|
33962
|
+
import { createHash as createHash2, randomUUID as randomUUID7 } from "node:crypto";
|
|
33963
|
+
import { mkdir as mkdir12 } from "node:fs/promises";
|
|
33964
|
+
import path37 from "node:path";
|
|
33958
33965
|
import micromatch4 from "micromatch";
|
|
33959
33966
|
import { readFileSync } from "node:fs";
|
|
33960
33967
|
import path31 from "node:path";
|
|
@@ -33967,12 +33974,19 @@ import { promisify as promisify4 } from "node:util";
|
|
|
33967
33974
|
import { cp, mkdir as mkdir10, readdir as readdir3, rm as rm4, stat as stat5 } from "node:fs/promises";
|
|
33968
33975
|
import os3 from "node:os";
|
|
33969
33976
|
import path34 from "node:path";
|
|
33970
|
-
import {
|
|
33971
|
-
import
|
|
33977
|
+
import { execFile } from "node:child_process";
|
|
33978
|
+
import { createHash } from "node:crypto";
|
|
33972
33979
|
import { existsSync as existsSync2 } from "node:fs";
|
|
33973
|
-
import
|
|
33974
|
-
import
|
|
33980
|
+
import { mkdir as mkdir11, rm as rm5, unlink, writeFile as writeFile7 } from "node:fs/promises";
|
|
33981
|
+
import os4 from "node:os";
|
|
33982
|
+
import path35 from "node:path";
|
|
33983
|
+
import { promisify as promisify5 } from "node:util";
|
|
33984
|
+
import { readdir as readdir4, stat as stat6 } from "node:fs/promises";
|
|
33985
|
+
import path36 from "node:path";
|
|
33986
|
+
import { existsSync as existsSync3 } from "node:fs";
|
|
33975
33987
|
import path38 from "node:path";
|
|
33988
|
+
import { mkdir as mkdir13, readFile as readFile11, writeFile as writeFile8 } from "node:fs/promises";
|
|
33989
|
+
import path39 from "node:path";
|
|
33976
33990
|
function computeTraceSummary(messages) {
|
|
33977
33991
|
const toolCallCounts = {};
|
|
33978
33992
|
const toolDurations = {};
|
|
@@ -35194,18 +35208,96 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35194
35208
|
});
|
|
35195
35209
|
continue;
|
|
35196
35210
|
}
|
|
35211
|
+
if (typeValue === "contains_any" || typeValue === "contains_all") {
|
|
35212
|
+
const value = asStringArrayStrict(rawEvaluator.value);
|
|
35213
|
+
if (!value || value.length === 0) {
|
|
35214
|
+
logWarning2(
|
|
35215
|
+
`Skipping ${typeValue} evaluator '${name16}' in '${evalId}': value must be a non-empty string array`
|
|
35216
|
+
);
|
|
35217
|
+
continue;
|
|
35218
|
+
}
|
|
35219
|
+
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35220
|
+
const required22 = parseRequired(rawEvaluator.required);
|
|
35221
|
+
evaluators.push({
|
|
35222
|
+
name: name16,
|
|
35223
|
+
type: typeValue,
|
|
35224
|
+
value,
|
|
35225
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
35226
|
+
...required22 !== void 0 ? { required: required22 } : {},
|
|
35227
|
+
...negate !== void 0 ? { negate } : {}
|
|
35228
|
+
});
|
|
35229
|
+
continue;
|
|
35230
|
+
}
|
|
35231
|
+
if (typeValue === "icontains") {
|
|
35232
|
+
const value = asString(rawEvaluator.value);
|
|
35233
|
+
if (!value) {
|
|
35234
|
+
logWarning2(`Skipping icontains evaluator '${name16}' in '${evalId}': missing value`);
|
|
35235
|
+
continue;
|
|
35236
|
+
}
|
|
35237
|
+
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35238
|
+
const required22 = parseRequired(rawEvaluator.required);
|
|
35239
|
+
evaluators.push({
|
|
35240
|
+
name: name16,
|
|
35241
|
+
type: "icontains",
|
|
35242
|
+
value,
|
|
35243
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
35244
|
+
...required22 !== void 0 ? { required: required22 } : {},
|
|
35245
|
+
...negate !== void 0 ? { negate } : {}
|
|
35246
|
+
});
|
|
35247
|
+
continue;
|
|
35248
|
+
}
|
|
35249
|
+
if (typeValue === "icontains_any" || typeValue === "icontains_all") {
|
|
35250
|
+
const value = asStringArrayStrict(rawEvaluator.value);
|
|
35251
|
+
if (!value || value.length === 0) {
|
|
35252
|
+
logWarning2(
|
|
35253
|
+
`Skipping ${typeValue} evaluator '${name16}' in '${evalId}': value must be a non-empty string array`
|
|
35254
|
+
);
|
|
35255
|
+
continue;
|
|
35256
|
+
}
|
|
35257
|
+
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35258
|
+
const required22 = parseRequired(rawEvaluator.required);
|
|
35259
|
+
evaluators.push({
|
|
35260
|
+
name: name16,
|
|
35261
|
+
type: typeValue,
|
|
35262
|
+
value,
|
|
35263
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
35264
|
+
...required22 !== void 0 ? { required: required22 } : {},
|
|
35265
|
+
...negate !== void 0 ? { negate } : {}
|
|
35266
|
+
});
|
|
35267
|
+
continue;
|
|
35268
|
+
}
|
|
35269
|
+
if (typeValue === "starts_with" || typeValue === "ends_with") {
|
|
35270
|
+
const value = asString(rawEvaluator.value);
|
|
35271
|
+
if (!value) {
|
|
35272
|
+
logWarning2(`Skipping ${typeValue} evaluator '${name16}' in '${evalId}': missing value`);
|
|
35273
|
+
continue;
|
|
35274
|
+
}
|
|
35275
|
+
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35276
|
+
const required22 = parseRequired(rawEvaluator.required);
|
|
35277
|
+
evaluators.push({
|
|
35278
|
+
name: name16,
|
|
35279
|
+
type: typeValue,
|
|
35280
|
+
value,
|
|
35281
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
35282
|
+
...required22 !== void 0 ? { required: required22 } : {},
|
|
35283
|
+
...negate !== void 0 ? { negate } : {}
|
|
35284
|
+
});
|
|
35285
|
+
continue;
|
|
35286
|
+
}
|
|
35197
35287
|
if (typeValue === "regex") {
|
|
35198
35288
|
const value = asString(rawEvaluator.value);
|
|
35199
35289
|
if (!value) {
|
|
35200
35290
|
logWarning2(`Skipping regex evaluator '${name16}' in '${evalId}': missing value`);
|
|
35201
35291
|
continue;
|
|
35202
35292
|
}
|
|
35293
|
+
const flags = asString(rawEvaluator.flags);
|
|
35203
35294
|
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35204
35295
|
const required22 = parseRequired(rawEvaluator.required);
|
|
35205
35296
|
evaluators.push({
|
|
35206
35297
|
name: name16,
|
|
35207
35298
|
type: "regex",
|
|
35208
35299
|
value,
|
|
35300
|
+
...flags !== void 0 ? { flags } : {},
|
|
35209
35301
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
35210
35302
|
...required22 !== void 0 ? { required: required22 } : {},
|
|
35211
35303
|
...negate !== void 0 ? { negate } : {}
|
|
@@ -35378,15 +35470,43 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35378
35470
|
}
|
|
35379
35471
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
35380
35472
|
}
|
|
35381
|
-
var ASSERTION_TYPES = /* @__PURE__ */ new Set([
|
|
35473
|
+
var ASSERTION_TYPES = /* @__PURE__ */ new Set([
|
|
35474
|
+
"contains",
|
|
35475
|
+
"contains_any",
|
|
35476
|
+
"contains_all",
|
|
35477
|
+
"icontains",
|
|
35478
|
+
"icontains_any",
|
|
35479
|
+
"icontains_all",
|
|
35480
|
+
"starts_with",
|
|
35481
|
+
"ends_with",
|
|
35482
|
+
"regex",
|
|
35483
|
+
"is_json",
|
|
35484
|
+
"equals",
|
|
35485
|
+
"rubrics"
|
|
35486
|
+
]);
|
|
35382
35487
|
function generateAssertionName(typeValue, rawEvaluator) {
|
|
35383
35488
|
if (!ASSERTION_TYPES.has(typeValue)) {
|
|
35384
35489
|
return void 0;
|
|
35385
35490
|
}
|
|
35386
35491
|
const value = asString(rawEvaluator.value);
|
|
35492
|
+
const arrayValue = Array.isArray(rawEvaluator.value) ? rawEvaluator.value : void 0;
|
|
35387
35493
|
switch (typeValue) {
|
|
35388
35494
|
case "contains":
|
|
35389
35495
|
return value ? `contains-${value}` : "contains";
|
|
35496
|
+
case "contains_any":
|
|
35497
|
+
return arrayValue ? `contains_any-${arrayValue.length}` : "contains_any";
|
|
35498
|
+
case "contains_all":
|
|
35499
|
+
return arrayValue ? `contains_all-${arrayValue.length}` : "contains_all";
|
|
35500
|
+
case "icontains":
|
|
35501
|
+
return value ? `icontains-${value}` : "icontains";
|
|
35502
|
+
case "icontains_any":
|
|
35503
|
+
return arrayValue ? `icontains_any-${arrayValue.length}` : "icontains_any";
|
|
35504
|
+
case "icontains_all":
|
|
35505
|
+
return arrayValue ? `icontains_all-${arrayValue.length}` : "icontains_all";
|
|
35506
|
+
case "starts_with":
|
|
35507
|
+
return value ? `starts_with-${value}` : "starts_with";
|
|
35508
|
+
case "ends_with":
|
|
35509
|
+
return value ? `ends_with-${value}` : "ends_with";
|
|
35390
35510
|
case "regex":
|
|
35391
35511
|
return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
|
|
35392
35512
|
case "is_json":
|
|
@@ -35412,6 +35532,13 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
35412
35532
|
function asString(value) {
|
|
35413
35533
|
return typeof value === "string" ? value : void 0;
|
|
35414
35534
|
}
|
|
35535
|
+
function asStringArrayStrict(value) {
|
|
35536
|
+
if (!Array.isArray(value)) {
|
|
35537
|
+
return void 0;
|
|
35538
|
+
}
|
|
35539
|
+
const result = value.filter((v) => typeof v === "string");
|
|
35540
|
+
return result.length > 0 ? result : void 0;
|
|
35541
|
+
}
|
|
35415
35542
|
function asStringArray(value, description) {
|
|
35416
35543
|
if (value === void 0) {
|
|
35417
35544
|
return void 0;
|
|
@@ -36702,6 +36829,69 @@ function parseWorkspaceScriptConfig(raw, evalFileDir) {
|
|
|
36702
36829
|
}
|
|
36703
36830
|
return cwd ? { ...config2, cwd } : config2;
|
|
36704
36831
|
}
|
|
36832
|
+
function parseRepoSource(raw) {
|
|
36833
|
+
if (!isJsonObject(raw)) return void 0;
|
|
36834
|
+
const obj = raw;
|
|
36835
|
+
if (obj.type === "git" && typeof obj.url === "string") {
|
|
36836
|
+
return { type: "git", url: obj.url };
|
|
36837
|
+
}
|
|
36838
|
+
if (obj.type === "local" && typeof obj.path === "string") {
|
|
36839
|
+
return { type: "local", path: obj.path };
|
|
36840
|
+
}
|
|
36841
|
+
return void 0;
|
|
36842
|
+
}
|
|
36843
|
+
function parseRepoCheckout(raw) {
|
|
36844
|
+
if (!isJsonObject(raw)) return void 0;
|
|
36845
|
+
const obj = raw;
|
|
36846
|
+
const ref = typeof obj.ref === "string" ? obj.ref : void 0;
|
|
36847
|
+
const resolve2 = obj.resolve === "remote" || obj.resolve === "local" ? obj.resolve : void 0;
|
|
36848
|
+
const ancestor = typeof obj.ancestor === "number" ? obj.ancestor : void 0;
|
|
36849
|
+
if (!ref && !resolve2 && ancestor === void 0) return void 0;
|
|
36850
|
+
return {
|
|
36851
|
+
...ref !== void 0 && { ref },
|
|
36852
|
+
...resolve2 !== void 0 && { resolve: resolve2 },
|
|
36853
|
+
...ancestor !== void 0 && { ancestor }
|
|
36854
|
+
};
|
|
36855
|
+
}
|
|
36856
|
+
function parseRepoClone(raw) {
|
|
36857
|
+
if (!isJsonObject(raw)) return void 0;
|
|
36858
|
+
const obj = raw;
|
|
36859
|
+
const depth = typeof obj.depth === "number" ? obj.depth : void 0;
|
|
36860
|
+
const filter2 = typeof obj.filter === "string" ? obj.filter : void 0;
|
|
36861
|
+
const sparse = Array.isArray(obj.sparse) ? obj.sparse.filter((s) => typeof s === "string") : void 0;
|
|
36862
|
+
if (depth === void 0 && !filter2 && !sparse) return void 0;
|
|
36863
|
+
return {
|
|
36864
|
+
...depth !== void 0 && { depth },
|
|
36865
|
+
...filter2 !== void 0 && { filter: filter2 },
|
|
36866
|
+
...sparse !== void 0 && { sparse }
|
|
36867
|
+
};
|
|
36868
|
+
}
|
|
36869
|
+
function parseRepoConfig(raw) {
|
|
36870
|
+
if (!isJsonObject(raw)) return void 0;
|
|
36871
|
+
const obj = raw;
|
|
36872
|
+
const repoPath = typeof obj.path === "string" ? obj.path : void 0;
|
|
36873
|
+
const source = parseRepoSource(obj.source);
|
|
36874
|
+
if (!repoPath || !source) return void 0;
|
|
36875
|
+
const checkout = parseRepoCheckout(obj.checkout);
|
|
36876
|
+
const clone2 = parseRepoClone(obj.clone);
|
|
36877
|
+
return {
|
|
36878
|
+
path: repoPath,
|
|
36879
|
+
source,
|
|
36880
|
+
...checkout !== void 0 && { checkout },
|
|
36881
|
+
...clone2 !== void 0 && { clone: clone2 }
|
|
36882
|
+
};
|
|
36883
|
+
}
|
|
36884
|
+
function parseResetConfig(raw) {
|
|
36885
|
+
if (!isJsonObject(raw)) return void 0;
|
|
36886
|
+
const obj = raw;
|
|
36887
|
+
const strategy = obj.strategy === "none" || obj.strategy === "hard" || obj.strategy === "recreate" ? obj.strategy : void 0;
|
|
36888
|
+
const afterEach = typeof obj.after_each === "boolean" ? obj.after_each : void 0;
|
|
36889
|
+
if (!strategy && afterEach === void 0) return void 0;
|
|
36890
|
+
return {
|
|
36891
|
+
...strategy !== void 0 && { strategy },
|
|
36892
|
+
...afterEach !== void 0 && { after_each: afterEach }
|
|
36893
|
+
};
|
|
36894
|
+
}
|
|
36705
36895
|
function parseWorkspaceConfig(raw, evalFileDir) {
|
|
36706
36896
|
if (!isJsonObject(raw)) return void 0;
|
|
36707
36897
|
const obj = raw;
|
|
@@ -36709,13 +36899,20 @@ function parseWorkspaceConfig(raw, evalFileDir) {
|
|
|
36709
36899
|
if (template && !path8.isAbsolute(template)) {
|
|
36710
36900
|
template = path8.resolve(evalFileDir, template);
|
|
36711
36901
|
}
|
|
36902
|
+
const isolation = obj.isolation === "shared" || obj.isolation === "per_test" ? obj.isolation : void 0;
|
|
36903
|
+
const repos = Array.isArray(obj.repos) ? obj.repos.map(parseRepoConfig).filter(Boolean) : void 0;
|
|
36904
|
+
const reset = parseResetConfig(obj.reset);
|
|
36712
36905
|
const beforeAll = parseWorkspaceScriptConfig(obj.before_all, evalFileDir);
|
|
36713
36906
|
const afterAll = parseWorkspaceScriptConfig(obj.after_all, evalFileDir);
|
|
36714
36907
|
const beforeEach = parseWorkspaceScriptConfig(obj.before_each, evalFileDir);
|
|
36715
36908
|
const afterEach = parseWorkspaceScriptConfig(obj.after_each, evalFileDir);
|
|
36716
|
-
if (!template && !beforeAll && !afterAll && !beforeEach && !afterEach)
|
|
36909
|
+
if (!template && !isolation && !repos && !reset && !beforeAll && !afterAll && !beforeEach && !afterEach)
|
|
36910
|
+
return void 0;
|
|
36717
36911
|
return {
|
|
36718
36912
|
...template !== void 0 && { template },
|
|
36913
|
+
...isolation !== void 0 && { isolation },
|
|
36914
|
+
...repos !== void 0 && { repos },
|
|
36915
|
+
...reset !== void 0 && { reset },
|
|
36719
36916
|
...beforeAll !== void 0 && { before_all: beforeAll },
|
|
36720
36917
|
...afterAll !== void 0 && { after_all: afterAll },
|
|
36721
36918
|
...beforeEach !== void 0 && { before_each: beforeEach },
|
|
@@ -36728,6 +36925,9 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
|
|
|
36728
36925
|
if (!caseLevel) return suiteLevel;
|
|
36729
36926
|
return {
|
|
36730
36927
|
template: caseLevel.template ?? suiteLevel.template,
|
|
36928
|
+
isolation: caseLevel.isolation ?? suiteLevel.isolation,
|
|
36929
|
+
repos: caseLevel.repos ?? suiteLevel.repos,
|
|
36930
|
+
reset: caseLevel.reset ?? suiteLevel.reset,
|
|
36731
36931
|
before_all: caseLevel.before_all ?? suiteLevel.before_all,
|
|
36732
36932
|
after_all: caseLevel.after_all ?? suiteLevel.after_all,
|
|
36733
36933
|
before_each: caseLevel.before_each ?? suiteLevel.before_each,
|
|
@@ -37248,11 +37448,6 @@ Original error: ${error40 instanceof Error ? error40.message : String(error40)}`
|
|
|
37248
37448
|
}
|
|
37249
37449
|
return claudeSdkModule;
|
|
37250
37450
|
}
|
|
37251
|
-
var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
|
|
37252
|
-
- Do NOT create any additional output files in the workspace.
|
|
37253
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
37254
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
37255
|
-
This is required for evaluation scoring.`;
|
|
37256
37451
|
var ClaudeProvider = class {
|
|
37257
37452
|
id;
|
|
37258
37453
|
kind = "claude";
|
|
@@ -37274,7 +37469,7 @@ var ClaudeProvider = class {
|
|
|
37274
37469
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
37275
37470
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
37276
37471
|
const prompt = buildPromptDocument(request, inputFiles);
|
|
37277
|
-
const systemPrompt = this.config.systemPrompt
|
|
37472
|
+
const systemPrompt = this.config.systemPrompt;
|
|
37278
37473
|
const queryOptions = {
|
|
37279
37474
|
permissionMode: "bypassPermissions",
|
|
37280
37475
|
allowDangerouslySkipPermissions: true,
|
|
@@ -38237,11 +38432,6 @@ Original error: ${error40 instanceof Error ? error40.message : String(error40)}`
|
|
|
38237
38432
|
}
|
|
38238
38433
|
return codexSdkModule;
|
|
38239
38434
|
}
|
|
38240
|
-
var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
|
|
38241
|
-
- Do NOT create any additional output files in the workspace.
|
|
38242
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
38243
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
38244
|
-
This is required for evaluation scoring.`;
|
|
38245
38435
|
var CodexProvider = class {
|
|
38246
38436
|
id;
|
|
38247
38437
|
kind = "codex";
|
|
@@ -38276,7 +38466,7 @@ var CodexProvider = class {
|
|
|
38276
38466
|
const thread = codex.startThread(threadOptions);
|
|
38277
38467
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
38278
38468
|
const basePrompt = buildPromptDocument(request, inputFiles);
|
|
38279
|
-
const systemPrompt = this.config.systemPrompt
|
|
38469
|
+
const systemPrompt = this.config.systemPrompt;
|
|
38280
38470
|
const prompt = systemPrompt ? `${systemPrompt}
|
|
38281
38471
|
|
|
38282
38472
|
${basePrompt}` : basePrompt;
|
|
@@ -38625,7 +38815,7 @@ function subscribeToCopilotCliLogEntries(listener) {
|
|
|
38625
38815
|
};
|
|
38626
38816
|
}
|
|
38627
38817
|
function resolvePlatformCliPath() {
|
|
38628
|
-
const
|
|
38818
|
+
const os5 = platform();
|
|
38629
38819
|
const cpu = arch();
|
|
38630
38820
|
const platformMap = {
|
|
38631
38821
|
linux: "linux",
|
|
@@ -38636,13 +38826,13 @@ function resolvePlatformCliPath() {
|
|
|
38636
38826
|
x64: "x64",
|
|
38637
38827
|
arm64: "arm64"
|
|
38638
38828
|
};
|
|
38639
|
-
const osPart = platformMap[
|
|
38829
|
+
const osPart = platformMap[os5];
|
|
38640
38830
|
const archPart = archMap[cpu];
|
|
38641
38831
|
if (!osPart || !archPart) {
|
|
38642
38832
|
return void 0;
|
|
38643
38833
|
}
|
|
38644
38834
|
const packageName = `@github/copilot-${osPart}-${archPart}`;
|
|
38645
|
-
const binaryName =
|
|
38835
|
+
const binaryName = os5 === "win32" ? "copilot.exe" : "copilot";
|
|
38646
38836
|
try {
|
|
38647
38837
|
const resolved = import.meta.resolve(`${packageName}/package.json`);
|
|
38648
38838
|
const packageJsonPath = resolved.startsWith("file:") ? fileURLToPath2(resolved) : resolved;
|
|
@@ -38782,11 +38972,6 @@ function isLogStreamingDisabled(envKey) {
|
|
|
38782
38972
|
const normalized = envValue.trim().toLowerCase();
|
|
38783
38973
|
return normalized === "false" || normalized === "0" || normalized === "off";
|
|
38784
38974
|
}
|
|
38785
|
-
var DEFAULT_SYSTEM_PROMPT4 = `**IMPORTANT**: Follow these instructions for your response:
|
|
38786
|
-
- Do NOT create any additional output files in the workspace.
|
|
38787
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
38788
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
38789
|
-
This is required for evaluation scoring.`;
|
|
38790
38975
|
var CopilotCliProvider = class {
|
|
38791
38976
|
id;
|
|
38792
38977
|
kind = "copilot-cli";
|
|
@@ -38989,8 +39174,8 @@ var CopilotCliProvider = class {
|
|
|
38989
39174
|
}
|
|
38990
39175
|
return args;
|
|
38991
39176
|
}
|
|
38992
|
-
resolveSystemPrompt(
|
|
38993
|
-
return this.config.systemPrompt
|
|
39177
|
+
resolveSystemPrompt(_request) {
|
|
39178
|
+
return this.config.systemPrompt;
|
|
38994
39179
|
}
|
|
38995
39180
|
async raceWithTimeout(sendPromise, agentProcess) {
|
|
38996
39181
|
const timeoutMs = this.config.timeoutMs;
|
|
@@ -39169,21 +39354,16 @@ Original error: ${error40 instanceof Error ? error40.message : String(error40)}`
|
|
|
39169
39354
|
}
|
|
39170
39355
|
return copilotSdkModule;
|
|
39171
39356
|
}
|
|
39172
|
-
var DEFAULT_SYSTEM_PROMPT5 = `**IMPORTANT**: Follow these instructions for your response:
|
|
39173
|
-
- Do NOT create any additional output files in the workspace.
|
|
39174
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
39175
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
39176
|
-
This is required for evaluation scoring.`;
|
|
39177
39357
|
var CopilotSdkProvider = class {
|
|
39178
39358
|
id;
|
|
39179
|
-
kind = "copilot";
|
|
39359
|
+
kind = "copilot-sdk";
|
|
39180
39360
|
targetName;
|
|
39181
39361
|
supportsBatch = false;
|
|
39182
39362
|
config;
|
|
39183
39363
|
// biome-ignore lint/suspicious/noExplicitAny: SDK client type is dynamically loaded
|
|
39184
39364
|
client = null;
|
|
39185
39365
|
constructor(targetName, config2) {
|
|
39186
|
-
this.id = `copilot:${targetName}`;
|
|
39366
|
+
this.id = `copilot-sdk:${targetName}`;
|
|
39187
39367
|
this.targetName = targetName;
|
|
39188
39368
|
this.config = config2;
|
|
39189
39369
|
}
|
|
@@ -39206,7 +39386,7 @@ var CopilotSdkProvider = class {
|
|
|
39206
39386
|
if (cwd) {
|
|
39207
39387
|
sessionOptions.workingDirectory = cwd;
|
|
39208
39388
|
}
|
|
39209
|
-
const systemPrompt = this.config.systemPrompt
|
|
39389
|
+
const systemPrompt = this.config.systemPrompt;
|
|
39210
39390
|
if (systemPrompt) {
|
|
39211
39391
|
sessionOptions.systemMessage = {
|
|
39212
39392
|
mode: "append",
|
|
@@ -39706,11 +39886,6 @@ function subscribeToPiLogEntries(listener) {
|
|
|
39706
39886
|
}
|
|
39707
39887
|
var WORKSPACE_PREFIX = "agentv-pi-";
|
|
39708
39888
|
var PROMPT_FILENAME = "prompt.md";
|
|
39709
|
-
var DEFAULT_SYSTEM_PROMPT6 = `**IMPORTANT**: Follow these instructions for your response:
|
|
39710
|
-
- Do NOT create any additional output files in the workspace.
|
|
39711
|
-
- All intended file outputs/changes MUST be written in your response.
|
|
39712
|
-
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
39713
|
-
This is required for evaluation scoring.`;
|
|
39714
39889
|
var PiCodingAgentProvider = class {
|
|
39715
39890
|
id;
|
|
39716
39891
|
kind = "pi-coding-agent";
|
|
@@ -39787,7 +39962,7 @@ var PiCodingAgentProvider = class {
|
|
|
39787
39962
|
}
|
|
39788
39963
|
return path16.resolve(this.config.cwd);
|
|
39789
39964
|
}
|
|
39790
|
-
buildPiArgs(prompt, inputFiles,
|
|
39965
|
+
buildPiArgs(prompt, inputFiles, _captureFileChanges) {
|
|
39791
39966
|
const args = [];
|
|
39792
39967
|
if (this.config.provider) {
|
|
39793
39968
|
args.push("--provider", this.config.provider);
|
|
@@ -39815,7 +39990,7 @@ var PiCodingAgentProvider = class {
|
|
|
39815
39990
|
args.push(`@${file2}`);
|
|
39816
39991
|
}
|
|
39817
39992
|
}
|
|
39818
|
-
const systemPrompt = this.config.systemPrompt
|
|
39993
|
+
const systemPrompt = this.config.systemPrompt;
|
|
39819
39994
|
const fullPrompt = systemPrompt ? `${systemPrompt}
|
|
39820
39995
|
|
|
39821
39996
|
${prompt}` : prompt;
|
|
@@ -41442,7 +41617,6 @@ var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
|
41442
41617
|
|
|
41443
41618
|
**IMPORTANT**: Follow these exact steps:
|
|
41444
41619
|
1. Create and write your complete response to: {{responseFileTmp}}
|
|
41445
|
-
- Do NOT create any additional output files in the workspace.
|
|
41446
41620
|
- All intended file outputs/changes MUST be written in your response file.
|
|
41447
41621
|
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
41448
41622
|
2. When completely finished, run these PowerShell commands to signal completion:
|
|
@@ -41461,7 +41635,6 @@ var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
|
41461
41635
|
|
|
41462
41636
|
**IMPORTANT**: Follow these exact steps:
|
|
41463
41637
|
1. Create and write your complete response to: {{responseFileTmp}}
|
|
41464
|
-
- Do NOT create any additional output files in the workspace.
|
|
41465
41638
|
- All intended file outputs/changes MUST be written in your response file.
|
|
41466
41639
|
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
41467
41640
|
2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
|
|
@@ -41873,7 +42046,7 @@ async function discoverProviders(registry2, baseDir) {
|
|
|
41873
42046
|
}
|
|
41874
42047
|
function createBuiltinProviderRegistry() {
|
|
41875
42048
|
const registry2 = new ProviderRegistry();
|
|
41876
|
-
registry2.register("azure", (t) => new AzureProvider(t.name, t.config)).register("anthropic", (t) => new AnthropicProvider(t.name, t.config)).register("gemini", (t) => new GeminiProvider(t.name, t.config)).register("cli", (t) => new CliProvider(t.name, t.config)).register("codex", (t) => new CodexProvider(t.name, t.config)).register("copilot", (t) => new CopilotSdkProvider(t.name, t.config)).register("copilot-cli", (t) => new CopilotCliProvider(t.name, t.config)).register("pi-coding-agent", (t) => new PiCodingAgentProvider(t.name, t.config)).register("pi-agent-sdk", (t) => new PiAgentSdkProvider(t.name, t.config)).register("claude", (t) => new ClaudeProvider(t.name, t.config)).register("mock", (t) => new MockProvider(t.name, t.config)).register("vscode", (t) => new VSCodeProvider(t.name, t.config, "vscode")).register(
|
|
42049
|
+
registry2.register("azure", (t) => new AzureProvider(t.name, t.config)).register("anthropic", (t) => new AnthropicProvider(t.name, t.config)).register("gemini", (t) => new GeminiProvider(t.name, t.config)).register("cli", (t) => new CliProvider(t.name, t.config)).register("codex", (t) => new CodexProvider(t.name, t.config)).register("copilot-sdk", (t) => new CopilotSdkProvider(t.name, t.config)).register("copilot-cli", (t) => new CopilotCliProvider(t.name, t.config)).register("pi-coding-agent", (t) => new PiCodingAgentProvider(t.name, t.config)).register("pi-agent-sdk", (t) => new PiAgentSdkProvider(t.name, t.config)).register("claude", (t) => new ClaudeProvider(t.name, t.config)).register("mock", (t) => new MockProvider(t.name, t.config)).register("vscode", (t) => new VSCodeProvider(t.name, t.config, "vscode")).register(
|
|
41877
42050
|
"vscode-insiders",
|
|
41878
42051
|
(t) => new VSCodeProvider(t.name, t.config, "vscode-insiders")
|
|
41879
42052
|
);
|
|
@@ -42053,16 +42226,16 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
42053
42226
|
});
|
|
42054
42227
|
}
|
|
42055
42228
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
42056
|
-
const { mkdir:
|
|
42229
|
+
const { mkdir: mkdir14, readFile: readFile12, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
|
|
42057
42230
|
const { tmpdir: tmpdir3 } = await import("node:os");
|
|
42058
|
-
const
|
|
42231
|
+
const path40 = await import("node:path");
|
|
42059
42232
|
const { randomUUID: randomUUID8 } = await import("node:crypto");
|
|
42060
|
-
const dir =
|
|
42061
|
-
await
|
|
42062
|
-
const stdinPath =
|
|
42063
|
-
const stdoutPath =
|
|
42064
|
-
const stderrPath =
|
|
42065
|
-
await
|
|
42233
|
+
const dir = path40.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
|
|
42234
|
+
await mkdir14(dir, { recursive: true });
|
|
42235
|
+
const stdinPath = path40.join(dir, "stdin.txt");
|
|
42236
|
+
const stdoutPath = path40.join(dir, "stdout.txt");
|
|
42237
|
+
const stderrPath = path40.join(dir, "stderr.txt");
|
|
42238
|
+
await writeFile9(stdinPath, stdinPayload, "utf8");
|
|
42066
42239
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
42067
42240
|
const { spawn: spawn4 } = await import("node:child_process");
|
|
42068
42241
|
try {
|
|
@@ -42095,7 +42268,7 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
42095
42268
|
const stderr = (await readFile12(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
42096
42269
|
return { stdout, stderr, exitCode };
|
|
42097
42270
|
} finally {
|
|
42098
|
-
await
|
|
42271
|
+
await rm6(dir, { recursive: true, force: true });
|
|
42099
42272
|
}
|
|
42100
42273
|
}
|
|
42101
42274
|
var DEFAULT_MAX_CALLS = 50;
|
|
@@ -42405,7 +42578,7 @@ var CodeEvaluator = class {
|
|
|
42405
42578
|
outputPath,
|
|
42406
42579
|
guidelineFiles: context.evalCase.guideline_paths,
|
|
42407
42580
|
inputFiles: context.evalCase.file_paths.filter(
|
|
42408
|
-
(
|
|
42581
|
+
(path40) => !context.evalCase.guideline_paths.includes(path40)
|
|
42409
42582
|
),
|
|
42410
42583
|
input: context.evalCase.input,
|
|
42411
42584
|
trace: context.trace ?? null,
|
|
@@ -42646,13 +42819,15 @@ ${context.fileChanges}`;
|
|
|
42646
42819
|
evaluatorRawRequest,
|
|
42647
42820
|
tokenUsage
|
|
42648
42821
|
};
|
|
42649
|
-
} catch {
|
|
42822
|
+
} catch (e) {
|
|
42823
|
+
const message = e instanceof Error ? e.message : String(e);
|
|
42650
42824
|
return {
|
|
42651
42825
|
score: 0,
|
|
42652
|
-
verdict: "
|
|
42826
|
+
verdict: "skip",
|
|
42653
42827
|
hits: [],
|
|
42654
|
-
misses: [],
|
|
42828
|
+
misses: [`Judge parse failure after 3 attempts: ${message}`],
|
|
42655
42829
|
expectedAspectCount: 1,
|
|
42830
|
+
reasoning: `Judge parse failure after 3 attempts: ${message}`,
|
|
42656
42831
|
evaluatorRawRequest
|
|
42657
42832
|
};
|
|
42658
42833
|
}
|
|
@@ -43586,115 +43761,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
43586
43761
|
* Evaluate a single field against the expected value.
|
|
43587
43762
|
*/
|
|
43588
43763
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
43589
|
-
const { path:
|
|
43590
|
-
const candidateValue = resolvePath(candidateData,
|
|
43591
|
-
const expectedValue = resolvePath(expectedData,
|
|
43764
|
+
const { path: path40, match, required: required2 = true, weight = 1 } = fieldConfig;
|
|
43765
|
+
const candidateValue = resolvePath(candidateData, path40);
|
|
43766
|
+
const expectedValue = resolvePath(expectedData, path40);
|
|
43592
43767
|
if (expectedValue === void 0) {
|
|
43593
43768
|
return {
|
|
43594
|
-
path:
|
|
43769
|
+
path: path40,
|
|
43595
43770
|
score: 1,
|
|
43596
43771
|
// No expected value means no comparison needed
|
|
43597
43772
|
weight,
|
|
43598
43773
|
hit: true,
|
|
43599
|
-
message: `${
|
|
43774
|
+
message: `${path40}: no expected value`
|
|
43600
43775
|
};
|
|
43601
43776
|
}
|
|
43602
43777
|
if (candidateValue === void 0) {
|
|
43603
43778
|
if (required2) {
|
|
43604
43779
|
return {
|
|
43605
|
-
path:
|
|
43780
|
+
path: path40,
|
|
43606
43781
|
score: 0,
|
|
43607
43782
|
weight,
|
|
43608
43783
|
hit: false,
|
|
43609
|
-
message: `${
|
|
43784
|
+
message: `${path40} (required, missing)`
|
|
43610
43785
|
};
|
|
43611
43786
|
}
|
|
43612
43787
|
return {
|
|
43613
|
-
path:
|
|
43788
|
+
path: path40,
|
|
43614
43789
|
score: 1,
|
|
43615
43790
|
// Don't penalize missing optional fields
|
|
43616
43791
|
weight: 0,
|
|
43617
43792
|
// Zero weight means it won't affect the score
|
|
43618
43793
|
hit: true,
|
|
43619
|
-
message: `${
|
|
43794
|
+
message: `${path40}: optional field missing`
|
|
43620
43795
|
};
|
|
43621
43796
|
}
|
|
43622
43797
|
switch (match) {
|
|
43623
43798
|
case "exact":
|
|
43624
|
-
return this.compareExact(
|
|
43799
|
+
return this.compareExact(path40, candidateValue, expectedValue, weight);
|
|
43625
43800
|
case "numeric_tolerance":
|
|
43626
43801
|
return this.compareNumericTolerance(
|
|
43627
|
-
|
|
43802
|
+
path40,
|
|
43628
43803
|
candidateValue,
|
|
43629
43804
|
expectedValue,
|
|
43630
43805
|
fieldConfig,
|
|
43631
43806
|
weight
|
|
43632
43807
|
);
|
|
43633
43808
|
case "date":
|
|
43634
|
-
return this.compareDate(
|
|
43809
|
+
return this.compareDate(path40, candidateValue, expectedValue, fieldConfig, weight);
|
|
43635
43810
|
default:
|
|
43636
43811
|
return {
|
|
43637
|
-
path:
|
|
43812
|
+
path: path40,
|
|
43638
43813
|
score: 0,
|
|
43639
43814
|
weight,
|
|
43640
43815
|
hit: false,
|
|
43641
|
-
message: `${
|
|
43816
|
+
message: `${path40}: unknown match type "${match}"`
|
|
43642
43817
|
};
|
|
43643
43818
|
}
|
|
43644
43819
|
}
|
|
43645
43820
|
/**
|
|
43646
43821
|
* Exact equality comparison.
|
|
43647
43822
|
*/
|
|
43648
|
-
compareExact(
|
|
43823
|
+
compareExact(path40, candidateValue, expectedValue, weight) {
|
|
43649
43824
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
43650
43825
|
return {
|
|
43651
|
-
path:
|
|
43826
|
+
path: path40,
|
|
43652
43827
|
score: 1,
|
|
43653
43828
|
weight,
|
|
43654
43829
|
hit: true,
|
|
43655
|
-
message:
|
|
43830
|
+
message: path40
|
|
43656
43831
|
};
|
|
43657
43832
|
}
|
|
43658
43833
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
43659
43834
|
return {
|
|
43660
|
-
path:
|
|
43835
|
+
path: path40,
|
|
43661
43836
|
score: 0,
|
|
43662
43837
|
weight,
|
|
43663
43838
|
hit: false,
|
|
43664
|
-
message: `${
|
|
43839
|
+
message: `${path40} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
43665
43840
|
};
|
|
43666
43841
|
}
|
|
43667
43842
|
return {
|
|
43668
|
-
path:
|
|
43843
|
+
path: path40,
|
|
43669
43844
|
score: 0,
|
|
43670
43845
|
weight,
|
|
43671
43846
|
hit: false,
|
|
43672
|
-
message: `${
|
|
43847
|
+
message: `${path40} (value mismatch)`
|
|
43673
43848
|
};
|
|
43674
43849
|
}
|
|
43675
43850
|
/**
|
|
43676
43851
|
* Numeric comparison with absolute or relative tolerance.
|
|
43677
43852
|
*/
|
|
43678
|
-
compareNumericTolerance(
|
|
43853
|
+
compareNumericTolerance(path40, candidateValue, expectedValue, fieldConfig, weight) {
|
|
43679
43854
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
43680
43855
|
const candidateNum = toNumber2(candidateValue);
|
|
43681
43856
|
const expectedNum = toNumber2(expectedValue);
|
|
43682
43857
|
if (candidateNum === null || expectedNum === null) {
|
|
43683
43858
|
return {
|
|
43684
|
-
path:
|
|
43859
|
+
path: path40,
|
|
43685
43860
|
score: 0,
|
|
43686
43861
|
weight,
|
|
43687
43862
|
hit: false,
|
|
43688
|
-
message: `${
|
|
43863
|
+
message: `${path40} (non-numeric value)`
|
|
43689
43864
|
};
|
|
43690
43865
|
}
|
|
43691
43866
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
43692
43867
|
return {
|
|
43693
|
-
path:
|
|
43868
|
+
path: path40,
|
|
43694
43869
|
score: 0,
|
|
43695
43870
|
weight,
|
|
43696
43871
|
hit: false,
|
|
43697
|
-
message: `${
|
|
43872
|
+
message: `${path40} (invalid numeric value)`
|
|
43698
43873
|
};
|
|
43699
43874
|
}
|
|
43700
43875
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -43707,61 +43882,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
43707
43882
|
}
|
|
43708
43883
|
if (withinTolerance) {
|
|
43709
43884
|
return {
|
|
43710
|
-
path:
|
|
43885
|
+
path: path40,
|
|
43711
43886
|
score: 1,
|
|
43712
43887
|
weight,
|
|
43713
43888
|
hit: true,
|
|
43714
|
-
message: `${
|
|
43889
|
+
message: `${path40} (within tolerance: diff=${diff.toFixed(2)})`
|
|
43715
43890
|
};
|
|
43716
43891
|
}
|
|
43717
43892
|
return {
|
|
43718
|
-
path:
|
|
43893
|
+
path: path40,
|
|
43719
43894
|
score: 0,
|
|
43720
43895
|
weight,
|
|
43721
43896
|
hit: false,
|
|
43722
|
-
message: `${
|
|
43897
|
+
message: `${path40} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
43723
43898
|
};
|
|
43724
43899
|
}
|
|
43725
43900
|
/**
|
|
43726
43901
|
* Date comparison with format normalization.
|
|
43727
43902
|
*/
|
|
43728
|
-
compareDate(
|
|
43903
|
+
compareDate(path40, candidateValue, expectedValue, fieldConfig, weight) {
|
|
43729
43904
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
43730
43905
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
43731
43906
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
43732
43907
|
if (candidateDate === null) {
|
|
43733
43908
|
return {
|
|
43734
|
-
path:
|
|
43909
|
+
path: path40,
|
|
43735
43910
|
score: 0,
|
|
43736
43911
|
weight,
|
|
43737
43912
|
hit: false,
|
|
43738
|
-
message: `${
|
|
43913
|
+
message: `${path40} (unparseable candidate date)`
|
|
43739
43914
|
};
|
|
43740
43915
|
}
|
|
43741
43916
|
if (expectedDate === null) {
|
|
43742
43917
|
return {
|
|
43743
|
-
path:
|
|
43918
|
+
path: path40,
|
|
43744
43919
|
score: 0,
|
|
43745
43920
|
weight,
|
|
43746
43921
|
hit: false,
|
|
43747
|
-
message: `${
|
|
43922
|
+
message: `${path40} (unparseable expected date)`
|
|
43748
43923
|
};
|
|
43749
43924
|
}
|
|
43750
43925
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
43751
43926
|
return {
|
|
43752
|
-
path:
|
|
43927
|
+
path: path40,
|
|
43753
43928
|
score: 1,
|
|
43754
43929
|
weight,
|
|
43755
43930
|
hit: true,
|
|
43756
|
-
message:
|
|
43931
|
+
message: path40
|
|
43757
43932
|
};
|
|
43758
43933
|
}
|
|
43759
43934
|
return {
|
|
43760
|
-
path:
|
|
43935
|
+
path: path40,
|
|
43761
43936
|
score: 0,
|
|
43762
43937
|
weight,
|
|
43763
43938
|
hit: false,
|
|
43764
|
-
message: `${
|
|
43939
|
+
message: `${path40} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
43765
43940
|
};
|
|
43766
43941
|
}
|
|
43767
43942
|
/**
|
|
@@ -43802,11 +43977,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
43802
43977
|
};
|
|
43803
43978
|
}
|
|
43804
43979
|
};
|
|
43805
|
-
function resolvePath(obj,
|
|
43806
|
-
if (!
|
|
43980
|
+
function resolvePath(obj, path40) {
|
|
43981
|
+
if (!path40 || !obj) {
|
|
43807
43982
|
return void 0;
|
|
43808
43983
|
}
|
|
43809
|
-
const parts =
|
|
43984
|
+
const parts = path40.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
43810
43985
|
let current = obj;
|
|
43811
43986
|
for (const part of parts) {
|
|
43812
43987
|
if (current === null || current === void 0) {
|
|
@@ -44610,8 +44785,8 @@ var TokenUsageEvaluator = class {
|
|
|
44610
44785
|
};
|
|
44611
44786
|
}
|
|
44612
44787
|
};
|
|
44613
|
-
function getNestedValue(obj,
|
|
44614
|
-
const parts =
|
|
44788
|
+
function getNestedValue(obj, path40) {
|
|
44789
|
+
const parts = path40.split(".");
|
|
44615
44790
|
let current = obj;
|
|
44616
44791
|
for (const part of parts) {
|
|
44617
44792
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -45073,13 +45248,78 @@ function runContainsAssertion(output, value) {
|
|
|
45073
45248
|
misses: passed ? [] : [`Output does not contain "${value}"`]
|
|
45074
45249
|
};
|
|
45075
45250
|
}
|
|
45076
|
-
function
|
|
45077
|
-
const
|
|
45251
|
+
function runContainsAnyAssertion(output, values) {
|
|
45252
|
+
const matched = values.filter((v) => output.includes(v));
|
|
45253
|
+
const passed = matched.length > 0;
|
|
45254
|
+
return {
|
|
45255
|
+
score: passed ? 1 : 0,
|
|
45256
|
+
hits: passed ? [`Output contains "${matched[0]}"`] : [],
|
|
45257
|
+
misses: passed ? [] : [`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`]
|
|
45258
|
+
};
|
|
45259
|
+
}
|
|
45260
|
+
function runContainsAllAssertion(output, values) {
|
|
45261
|
+
const missing = values.filter((v) => !output.includes(v));
|
|
45262
|
+
const passed = missing.length === 0;
|
|
45263
|
+
return {
|
|
45264
|
+
score: passed ? 1 : 0,
|
|
45265
|
+
hits: passed ? [`Output contains all ${values.length} expected strings`] : [],
|
|
45266
|
+
misses: passed ? [] : [`Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`]
|
|
45267
|
+
};
|
|
45268
|
+
}
|
|
45269
|
+
function runIcontainsAssertion(output, value) {
|
|
45270
|
+
const passed = output.toLowerCase().includes(value.toLowerCase());
|
|
45271
|
+
return {
|
|
45272
|
+
score: passed ? 1 : 0,
|
|
45273
|
+
hits: passed ? [`Output contains "${value}" (case-insensitive)`] : [],
|
|
45274
|
+
misses: passed ? [] : [`Output does not contain "${value}" (case-insensitive)`]
|
|
45275
|
+
};
|
|
45276
|
+
}
|
|
45277
|
+
function runIcontainsAnyAssertion(output, values) {
|
|
45278
|
+
const lower = output.toLowerCase();
|
|
45279
|
+
const matched = values.filter((v) => lower.includes(v.toLowerCase()));
|
|
45280
|
+
const passed = matched.length > 0;
|
|
45281
|
+
return {
|
|
45282
|
+
score: passed ? 1 : 0,
|
|
45283
|
+
hits: passed ? [`Output contains "${matched[0]}" (case-insensitive)`] : [],
|
|
45284
|
+
misses: passed ? [] : [
|
|
45285
|
+
`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`
|
|
45286
|
+
]
|
|
45287
|
+
};
|
|
45288
|
+
}
|
|
45289
|
+
function runIcontainsAllAssertion(output, values) {
|
|
45290
|
+
const lower = output.toLowerCase();
|
|
45291
|
+
const missing = values.filter((v) => !lower.includes(v.toLowerCase()));
|
|
45292
|
+
const passed = missing.length === 0;
|
|
45293
|
+
return {
|
|
45294
|
+
score: passed ? 1 : 0,
|
|
45295
|
+
hits: passed ? [`Output contains all ${values.length} expected strings (case-insensitive)`] : [],
|
|
45296
|
+
misses: passed ? [] : [`Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`]
|
|
45297
|
+
};
|
|
45298
|
+
}
|
|
45299
|
+
function runStartsWithAssertion(output, value) {
|
|
45300
|
+
const passed = output.trim().startsWith(value.trim());
|
|
45301
|
+
return {
|
|
45302
|
+
score: passed ? 1 : 0,
|
|
45303
|
+
hits: passed ? [`Output starts with "${value}"`] : [],
|
|
45304
|
+
misses: passed ? [] : [`Output does not start with "${value}"`]
|
|
45305
|
+
};
|
|
45306
|
+
}
|
|
45307
|
+
function runEndsWithAssertion(output, value) {
|
|
45308
|
+
const passed = output.trim().endsWith(value.trim());
|
|
45309
|
+
return {
|
|
45310
|
+
score: passed ? 1 : 0,
|
|
45311
|
+
hits: passed ? [`Output ends with "${value}"`] : [],
|
|
45312
|
+
misses: passed ? [] : [`Output does not end with "${value}"`]
|
|
45313
|
+
};
|
|
45314
|
+
}
|
|
45315
|
+
function runRegexAssertion(output, pattern, flags) {
|
|
45316
|
+
const regex = new RegExp(pattern, flags);
|
|
45078
45317
|
const passed = regex.test(output);
|
|
45318
|
+
const flagsLabel = flags ? ` (flags: ${flags})` : "";
|
|
45079
45319
|
return {
|
|
45080
45320
|
score: passed ? 1 : 0,
|
|
45081
|
-
hits: passed ? [`Output matches pattern /${pattern}
|
|
45082
|
-
misses: passed ? [] : [`Output does not match pattern /${pattern}
|
|
45321
|
+
hits: passed ? [`Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}`] : [],
|
|
45322
|
+
misses: passed ? [] : [`Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`]
|
|
45083
45323
|
};
|
|
45084
45324
|
}
|
|
45085
45325
|
function runIsJsonAssertion(output) {
|
|
@@ -45477,13 +45717,13 @@ var containsFactory = (config2) => {
|
|
|
45477
45717
|
var regexFactory = (config2) => {
|
|
45478
45718
|
const c = config2;
|
|
45479
45719
|
return new DeterministicAssertionEvaluator("regex", (ctx) => {
|
|
45480
|
-
const result = runRegexAssertion(ctx.candidate, c.value);
|
|
45720
|
+
const result = runRegexAssertion(ctx.candidate, c.value, c.flags);
|
|
45481
45721
|
return {
|
|
45482
45722
|
score: result.score,
|
|
45483
45723
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
45484
45724
|
hits: result.hits,
|
|
45485
45725
|
misses: result.misses,
|
|
45486
|
-
reasoning: result.score === 1 ? `Output matches pattern /${c.value}
|
|
45726
|
+
reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
|
|
45487
45727
|
expectedAspectCount: 1
|
|
45488
45728
|
};
|
|
45489
45729
|
});
|
|
@@ -45515,9 +45755,107 @@ var equalsFactory = (config2) => {
|
|
|
45515
45755
|
};
|
|
45516
45756
|
});
|
|
45517
45757
|
};
|
|
45758
|
+
var containsAnyFactory = (config2) => {
|
|
45759
|
+
const c = config2;
|
|
45760
|
+
return new DeterministicAssertionEvaluator("contains_any", (ctx) => {
|
|
45761
|
+
const result = runContainsAnyAssertion(ctx.candidate, c.value);
|
|
45762
|
+
return {
|
|
45763
|
+
score: result.score,
|
|
45764
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
45765
|
+
hits: result.hits,
|
|
45766
|
+
misses: result.misses,
|
|
45767
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
45768
|
+
expectedAspectCount: 1
|
|
45769
|
+
};
|
|
45770
|
+
});
|
|
45771
|
+
};
|
|
45772
|
+
var containsAllFactory = (config2) => {
|
|
45773
|
+
const c = config2;
|
|
45774
|
+
return new DeterministicAssertionEvaluator("contains_all", (ctx) => {
|
|
45775
|
+
const result = runContainsAllAssertion(ctx.candidate, c.value);
|
|
45776
|
+
return {
|
|
45777
|
+
score: result.score,
|
|
45778
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
45779
|
+
hits: result.hits,
|
|
45780
|
+
misses: result.misses,
|
|
45781
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
45782
|
+
expectedAspectCount: 1
|
|
45783
|
+
};
|
|
45784
|
+
});
|
|
45785
|
+
};
|
|
45786
|
+
var icontainsFactory = (config2) => {
|
|
45787
|
+
const c = config2;
|
|
45788
|
+
return new DeterministicAssertionEvaluator("icontains", (ctx) => {
|
|
45789
|
+
const result = runIcontainsAssertion(ctx.candidate, c.value);
|
|
45790
|
+
return {
|
|
45791
|
+
score: result.score,
|
|
45792
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
45793
|
+
hits: result.hits,
|
|
45794
|
+
misses: result.misses,
|
|
45795
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
45796
|
+
expectedAspectCount: 1
|
|
45797
|
+
};
|
|
45798
|
+
});
|
|
45799
|
+
};
|
|
45800
|
+
var icontainsAnyFactory = (config2) => {
|
|
45801
|
+
const c = config2;
|
|
45802
|
+
return new DeterministicAssertionEvaluator("icontains_any", (ctx) => {
|
|
45803
|
+
const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
|
|
45804
|
+
return {
|
|
45805
|
+
score: result.score,
|
|
45806
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
45807
|
+
hits: result.hits,
|
|
45808
|
+
misses: result.misses,
|
|
45809
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
45810
|
+
expectedAspectCount: 1
|
|
45811
|
+
};
|
|
45812
|
+
});
|
|
45813
|
+
};
|
|
45814
|
+
var icontainsAllFactory = (config2) => {
|
|
45815
|
+
const c = config2;
|
|
45816
|
+
return new DeterministicAssertionEvaluator("icontains_all", (ctx) => {
|
|
45817
|
+
const result = runIcontainsAllAssertion(ctx.candidate, c.value);
|
|
45818
|
+
return {
|
|
45819
|
+
score: result.score,
|
|
45820
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
45821
|
+
hits: result.hits,
|
|
45822
|
+
misses: result.misses,
|
|
45823
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
45824
|
+
expectedAspectCount: 1
|
|
45825
|
+
};
|
|
45826
|
+
});
|
|
45827
|
+
};
|
|
45828
|
+
var startsWithFactory = (config2) => {
|
|
45829
|
+
const c = config2;
|
|
45830
|
+
return new DeterministicAssertionEvaluator("starts_with", (ctx) => {
|
|
45831
|
+
const result = runStartsWithAssertion(ctx.candidate, c.value);
|
|
45832
|
+
return {
|
|
45833
|
+
score: result.score,
|
|
45834
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
45835
|
+
hits: result.hits,
|
|
45836
|
+
misses: result.misses,
|
|
45837
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
45838
|
+
expectedAspectCount: 1
|
|
45839
|
+
};
|
|
45840
|
+
});
|
|
45841
|
+
};
|
|
45842
|
+
var endsWithFactory = (config2) => {
|
|
45843
|
+
const c = config2;
|
|
45844
|
+
return new DeterministicAssertionEvaluator("ends_with", (ctx) => {
|
|
45845
|
+
const result = runEndsWithAssertion(ctx.candidate, c.value);
|
|
45846
|
+
return {
|
|
45847
|
+
score: result.score,
|
|
45848
|
+
verdict: result.score === 1 ? "pass" : "fail",
|
|
45849
|
+
hits: result.hits,
|
|
45850
|
+
misses: result.misses,
|
|
45851
|
+
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
45852
|
+
expectedAspectCount: 1
|
|
45853
|
+
};
|
|
45854
|
+
});
|
|
45855
|
+
};
|
|
45518
45856
|
function createBuiltinRegistry() {
|
|
45519
45857
|
const registry2 = new EvaluatorRegistry();
|
|
45520
|
-
registry2.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
|
|
45858
|
+
registry2.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("contains_any", containsAnyFactory).register("contains_all", containsAllFactory).register("icontains", icontainsFactory).register("icontains_any", icontainsAnyFactory).register("icontains_all", icontainsAllFactory).register("starts_with", startsWithFactory).register("ends_with", endsWithFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
|
|
45521
45859
|
return registry2;
|
|
45522
45860
|
}
|
|
45523
45861
|
async function discoverAssertions(registry2, baseDir) {
|
|
@@ -45843,15 +46181,186 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
|
|
|
45843
46181
|
await rm4(evalDir, { recursive: true, force: true });
|
|
45844
46182
|
}
|
|
45845
46183
|
}
|
|
46184
|
+
var execFileAsync = promisify5(execFile);
|
|
46185
|
+
var DEFAULT_CACHE_DIR = path35.join(os4.homedir(), ".agentv", "git-cache");
|
|
46186
|
+
var DEFAULT_TIMEOUT_MS2 = 3e5;
|
|
46187
|
+
var LOCK_TIMEOUT_MS = 6e4;
|
|
46188
|
+
function gitEnv() {
|
|
46189
|
+
const env = { ...process.env };
|
|
46190
|
+
for (const key of Object.keys(env)) {
|
|
46191
|
+
if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
|
|
46192
|
+
delete env[key];
|
|
46193
|
+
}
|
|
46194
|
+
}
|
|
46195
|
+
return {
|
|
46196
|
+
...env,
|
|
46197
|
+
GIT_TERMINAL_PROMPT: "0",
|
|
46198
|
+
GIT_ASKPASS: "",
|
|
46199
|
+
GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
|
|
46200
|
+
};
|
|
46201
|
+
}
|
|
46202
|
+
function cacheKey(source) {
|
|
46203
|
+
const raw = source.type === "git" ? source.url.toLowerCase().replace(/\.git$/, "") : source.path;
|
|
46204
|
+
return createHash("sha256").update(raw).digest("hex");
|
|
46205
|
+
}
|
|
46206
|
+
function getSourceUrl(source) {
|
|
46207
|
+
return source.type === "git" ? source.url : source.path;
|
|
46208
|
+
}
|
|
46209
|
+
async function git(args, opts) {
|
|
46210
|
+
const { stdout } = await execFileAsync("git", args, {
|
|
46211
|
+
cwd: opts?.cwd,
|
|
46212
|
+
timeout: opts?.timeout ?? DEFAULT_TIMEOUT_MS2,
|
|
46213
|
+
env: gitEnv(),
|
|
46214
|
+
maxBuffer: 50 * 1024 * 1024
|
|
46215
|
+
// 50MB
|
|
46216
|
+
});
|
|
46217
|
+
return stdout.trim();
|
|
46218
|
+
}
|
|
46219
|
+
async function acquireLock(lockPath) {
|
|
46220
|
+
const start = Date.now();
|
|
46221
|
+
while (Date.now() - start < LOCK_TIMEOUT_MS) {
|
|
46222
|
+
try {
|
|
46223
|
+
await writeFile7(lockPath, String(process.pid), { flag: "wx" });
|
|
46224
|
+
return;
|
|
46225
|
+
} catch (err) {
|
|
46226
|
+
if (err.code === "EEXIST") {
|
|
46227
|
+
await new Promise((r) => setTimeout(r, 200));
|
|
46228
|
+
continue;
|
|
46229
|
+
}
|
|
46230
|
+
throw err;
|
|
46231
|
+
}
|
|
46232
|
+
}
|
|
46233
|
+
throw new Error(`Timed out waiting for lock: ${lockPath}`);
|
|
46234
|
+
}
|
|
46235
|
+
async function releaseLock(lockPath) {
|
|
46236
|
+
try {
|
|
46237
|
+
await unlink(lockPath);
|
|
46238
|
+
} catch {
|
|
46239
|
+
}
|
|
46240
|
+
}
|
|
46241
|
+
/**
 * Manages git repositories used by evaluation workspaces.
 *
 * Keeps one bare mirror per source under `cacheDir` and clones from that
 * mirror into a workspace, applying the clone options (`depth`, `filter`,
 * `sparse`) and checkout options (`ref`, `resolve`, `ancestor`) read from each
 * repo entry.
 */
var RepoManager = class {
  /** Directory that holds the per-source mirrors and their `<key>.lock` files. */
  cacheDir;

  /**
   * @param {string} [cacheDir] Cache root; falls back to DEFAULT_CACHE_DIR when
   *   null or undefined.
   */
  constructor(cacheDir) {
    this.cacheDir = cacheDir ?? DEFAULT_CACHE_DIR;
  }

  /**
   * Ensure a bare mirror cache exists for the given source.
   * Clones it on first access and runs `git fetch --prune` on later calls.
   * The clone/fetch runs between acquireLock/releaseLock on `<cachePath>.lock`,
   * and the lock is released even when git fails.
   *
   * @param {object} source Repo source descriptor (passed to cacheKey and getSourceUrl).
   * @returns {Promise<string>} Path of the mirror: `cacheDir` joined with the cache key.
   */
  async ensureCache(source) {
    const key = cacheKey(source);
    const cachePath = path35.join(this.cacheDir, key);
    const lockPath = `${cachePath}.lock`;
    await mkdir11(this.cacheDir, { recursive: true });
    await acquireLock(lockPath);
    try {
      // A HEAD file inside the directory is used as the "mirror already exists" marker.
      if (existsSync2(path35.join(cachePath, "HEAD"))) {
        await git(["fetch", "--prune"], { cwd: cachePath });
      } else {
        await git(["clone", "--mirror", "--bare", getSourceUrl(source), cachePath]);
      }
    } finally {
      await releaseLock(lockPath);
    }
    return cachePath;
  }

  /**
   * Clone a repo from the cache into `<workspacePath>/<repo.path>` and check out
   * the configured revision.
   *
   * Steps: refresh the cache, `git clone --no-checkout` (with optional
   * `--depth` / `--filter`), optional cone-mode sparse checkout, resolve the ref
   * (default "HEAD"), check it out, then optionally move to the Nth ancestor.
   *
   * @param {object} repo Repo entry: `path`, `source`, optional `clone` and `checkout`.
   * @param {string} workspacePath Workspace root that receives the clone.
   * @returns {Promise<void>}
   * @throws {Error} When the ref is missing on the remote, when the ancestor
   *   cannot be resolved, or when any git command fails.
   */
  async materialize(repo, workspacePath) {
    const targetDir = path35.join(workspacePath, repo.path);
    const cachePath = await this.ensureCache(repo.source);
    const depth = repo.clone?.depth;
    const filter = repo.clone?.filter;

    const cloneArgs = ["clone"];
    if (depth) {
      cloneArgs.push("--depth", String(depth));
    }
    if (filter) {
      cloneArgs.push("--filter", filter);
    }
    cloneArgs.push("--no-checkout");
    // Shallow/partial clones go through a file:// URL instead of the plain path;
    // git ignores --depth for plain local-path clones (see git-clone docs).
    const cloneUrl = depth || filter ? `file://${cachePath}` : cachePath;
    cloneArgs.push(cloneUrl, targetDir);
    await git(cloneArgs);

    if (repo.clone?.sparse?.length) {
      await git(["sparse-checkout", "init", "--cone"], { cwd: targetDir });
      await git(["sparse-checkout", "set", ...repo.clone.sparse], { cwd: targetDir });
    }

    const ref = repo.checkout?.ref ?? "HEAD";
    const checkoutTarget = await this.#resolveCheckoutTarget(repo, ref);
    await git(["checkout", checkoutTarget], { cwd: targetDir });
    await this.#checkoutAncestor(repo, ref, targetDir);
  }

  /**
   * Resolve `ref` to the revision passed to `git checkout`.
   *
   * With `checkout.resolve === "remote"` (the default) and a "git" source, the
   * remote is queried with `git ls-remote`. If that command fails with a
   * message that does not contain "not found", the ref is used unresolved.
   * In every other configuration the ref is returned unchanged.
   */
  async #resolveCheckoutTarget(repo, ref) {
    const resolveMode = repo.checkout?.resolve ?? "remote";
    if (resolveMode !== "remote" || repo.source.type !== "git") {
      return ref;
    }
    const url = getSourceUrl(repo.source);
    let lsOutput;
    try {
      lsOutput = await git(["ls-remote", url, ref]);
    } catch (err) {
      if (err instanceof Error && err.message.includes("not found")) throw err;
      return ref;
    }
    // ls-remote prints "<sha>\t<refname>", one line per matching ref. Take the
    // first whitespace-delimited token so a tab-separated line yields only the SHA.
    const [sha] = `${lsOutput ?? ""}`.trim().split(/\s+/);
    if (!sha) {
      throw new Error(`Ref '${ref}' not found on remote ${url}`);
    }
    return sha;
  }

  /**
   * Move the checkout to `HEAD~<checkout.ancestor>` when ancestor > 0.
   * If the first attempt fails on a clone made with `clone.depth`, history is
   * deepened by `ancestor` commits and the attempt is repeated once.
   */
  async #checkoutAncestor(repo, ref, targetDir) {
    const ancestor = repo.checkout?.ancestor ?? 0;
    if (!(ancestor > 0)) {
      return;
    }
    const checkoutAncestorCommit = async () => {
      const ancestorSha = await git(["rev-parse", `HEAD~${ancestor}`], { cwd: targetDir });
      await git(["checkout", ancestorSha], { cwd: targetDir });
    };
    try {
      await checkoutAncestorCommit();
    } catch (err) {
      if (!repo.clone?.depth) {
        // Keep the underlying git failure attached for diagnosis.
        throw new Error(
          `Cannot resolve ancestor ${ancestor} of ref '${ref}'. If using shallow clone, increase clone.depth to at least ${ancestor + 1}.`,
          { cause: err }
        );
      }
      await git(["fetch", "--deepen", String(ancestor)], { cwd: targetDir });
      await checkoutAncestorCommit();
    }
  }

  /**
   * Materialize all repos into the workspace, one after another in array order.
   *
   * @param {object[]} repos Repo entries (see materialize).
   * @param {string} workspacePath Workspace root.
   * @returns {Promise<void>}
   */
  async materializeAll(repos, workspacePath) {
    for (const repo of repos) {
      await this.materialize(repo, workspacePath);
    }
  }

  /**
   * Reset the repos inside a workspace.
   *
   * - "recreate": delete every repo directory, then materialize all of them again.
   * - any other strategy: `git reset --hard HEAD` followed by `git clean -fd`
   *   in each repo directory.
   *
   * NOTE(review): the non-recreate path resets to whatever HEAD currently is,
   * so commits made in the workspace after materialization are kept — confirm
   * that matches the intended "checkout state".
   *
   * @param {object[]} repos Repo entries (see materialize).
   * @param {string} workspacePath Workspace root.
   * @param {string} strategy Reset strategy name.
   * @returns {Promise<void>}
   */
  async reset(repos, workspacePath, strategy) {
    if (strategy === "recreate") {
      for (const repo of repos) {
        const targetDir = path35.join(workspacePath, repo.path);
        await rm5(targetDir, { recursive: true, force: true });
      }
      await this.materializeAll(repos, workspacePath);
      return;
    }
    for (const repo of repos) {
      const targetDir = path35.join(workspacePath, repo.path);
      await git(["reset", "--hard", "HEAD"], { cwd: targetDir });
      await git(["clean", "-fd"], { cwd: targetDir });
    }
  }

  /** Remove the entire cache directory; a missing directory is not an error (`force: true`). */
  async cleanCache() {
    await rm5(this.cacheDir, { recursive: true, force: true });
  }
};
|
|
45846
46355
|
async function resolveWorkspaceTemplate(templatePath) {
|
|
45847
46356
|
if (!templatePath) {
|
|
45848
46357
|
return void 0;
|
|
45849
46358
|
}
|
|
45850
|
-
const resolved =
|
|
46359
|
+
const resolved = path36.resolve(templatePath);
|
|
45851
46360
|
const stats = await stat6(resolved);
|
|
45852
46361
|
if (stats.isFile()) {
|
|
45853
46362
|
return {
|
|
45854
|
-
dir:
|
|
46363
|
+
dir: path36.dirname(resolved),
|
|
45855
46364
|
workspaceFile: resolved
|
|
45856
46365
|
};
|
|
45857
46366
|
}
|
|
@@ -45863,14 +46372,14 @@ async function resolveWorkspaceTemplate(templatePath) {
|
|
|
45863
46372
|
if (workspaceFiles.length === 1) {
|
|
45864
46373
|
return {
|
|
45865
46374
|
dir: resolved,
|
|
45866
|
-
workspaceFile:
|
|
46375
|
+
workspaceFile: path36.join(resolved, workspaceFiles[0])
|
|
45867
46376
|
};
|
|
45868
46377
|
}
|
|
45869
46378
|
if (workspaceFiles.length > 1) {
|
|
45870
46379
|
const conventionFile = workspaceFiles.find((f) => f === "template.code-workspace");
|
|
45871
46380
|
return {
|
|
45872
46381
|
dir: resolved,
|
|
45873
|
-
workspaceFile: conventionFile ?
|
|
46382
|
+
workspaceFile: conventionFile ? path36.join(resolved, conventionFile) : void 0
|
|
45874
46383
|
};
|
|
45875
46384
|
}
|
|
45876
46385
|
return { dir: resolved };
|
|
@@ -45988,6 +46497,11 @@ async function runEvaluation(options) {
|
|
|
45988
46497
|
}
|
|
45989
46498
|
return getOrCreateProvider(resolvedJudge);
|
|
45990
46499
|
};
|
|
46500
|
+
if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
|
|
46501
|
+
throw new Error(
|
|
46502
|
+
`Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure_base).`
|
|
46503
|
+
);
|
|
46504
|
+
}
|
|
45991
46505
|
const targetResolver = (name16) => {
|
|
45992
46506
|
const resolved = resolveTargetByName(name16);
|
|
45993
46507
|
if (!resolved) {
|
|
@@ -46001,7 +46515,7 @@ async function runEvaluation(options) {
|
|
|
46001
46515
|
];
|
|
46002
46516
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
46003
46517
|
const typeRegistry = createBuiltinRegistry();
|
|
46004
|
-
const discoveryBaseDir = evalFilePath ?
|
|
46518
|
+
const discoveryBaseDir = evalFilePath ? path37.dirname(path37.resolve(evalFilePath)) : process.cwd();
|
|
46005
46519
|
await discoverAssertions(typeRegistry, discoveryBaseDir);
|
|
46006
46520
|
const providerRegistry = createBuiltinProviderRegistry();
|
|
46007
46521
|
await discoverProviders(providerRegistry, discoveryBaseDir);
|
|
@@ -46056,7 +46570,8 @@ async function runEvaluation(options) {
|
|
|
46056
46570
|
const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
|
|
46057
46571
|
const workspaceTemplate = resolvedTemplate?.dir;
|
|
46058
46572
|
const suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
|
|
46059
|
-
const
|
|
46573
|
+
const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
|
|
46574
|
+
const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
|
|
46060
46575
|
const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
|
|
46061
46576
|
const workers = hasSharedWorkspace ? 1 : requestedWorkers;
|
|
46062
46577
|
if (hasSharedWorkspace && requestedWorkers > 1) {
|
|
@@ -46075,9 +46590,22 @@ async function runEvaluation(options) {
|
|
|
46075
46590
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
46076
46591
|
throw new Error(`Failed to create shared workspace: ${message}`);
|
|
46077
46592
|
}
|
|
46078
|
-
} else if (suiteWorkspace?.before_all) {
|
|
46593
|
+
} else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
|
|
46079
46594
|
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
46080
|
-
await
|
|
46595
|
+
await mkdir12(sharedWorkspacePath, { recursive: true });
|
|
46596
|
+
}
|
|
46597
|
+
const repoManager = suiteWorkspace?.repos?.length ? new RepoManager() : void 0;
|
|
46598
|
+
if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos && !isPerTestIsolation) {
|
|
46599
|
+
try {
|
|
46600
|
+
await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
|
|
46601
|
+
} catch (error40) {
|
|
46602
|
+
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
46603
|
+
if (sharedWorkspacePath) {
|
|
46604
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
46605
|
+
});
|
|
46606
|
+
}
|
|
46607
|
+
throw new Error(`Failed to materialize repos: ${message}`);
|
|
46608
|
+
}
|
|
46081
46609
|
}
|
|
46082
46610
|
if (sharedWorkspacePath && suiteWorkspace?.before_all) {
|
|
46083
46611
|
const scriptContext = {
|
|
@@ -46168,7 +46696,8 @@ async function runEvaluation(options) {
|
|
|
46168
46696
|
sharedBaselineCommit,
|
|
46169
46697
|
suiteWorkspaceFile,
|
|
46170
46698
|
streamCallbacks,
|
|
46171
|
-
typeRegistry
|
|
46699
|
+
typeRegistry,
|
|
46700
|
+
repoManager
|
|
46172
46701
|
};
|
|
46173
46702
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
46174
46703
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -46443,15 +46972,16 @@ async function runEvalCase(options) {
|
|
|
46443
46972
|
sharedWorkspacePath,
|
|
46444
46973
|
sharedBaselineCommit,
|
|
46445
46974
|
suiteWorkspaceFile,
|
|
46446
|
-
typeRegistry: providedTypeRegistry
|
|
46975
|
+
typeRegistry: providedTypeRegistry,
|
|
46976
|
+
repoManager
|
|
46447
46977
|
} = options;
|
|
46448
46978
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
46449
46979
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
46450
46980
|
const typeRegistry = providedTypeRegistry ?? createBuiltinRegistry();
|
|
46451
|
-
const
|
|
46981
|
+
const cacheKey2 = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
|
|
46452
46982
|
let cachedResponse;
|
|
46453
|
-
if (
|
|
46454
|
-
cachedResponse = await cache.get(
|
|
46983
|
+
if (cacheKey2 && cache) {
|
|
46984
|
+
cachedResponse = await cache.get(cacheKey2);
|
|
46455
46985
|
}
|
|
46456
46986
|
const nowFn = now ?? (() => /* @__PURE__ */ new Date());
|
|
46457
46987
|
let workspacePath = sharedWorkspacePath;
|
|
@@ -46480,9 +47010,25 @@ async function runEvalCase(options) {
|
|
|
46480
47010
|
);
|
|
46481
47011
|
}
|
|
46482
47012
|
}
|
|
46483
|
-
if (!workspacePath && evalCase.workspace?.before_all && evalRunId) {
|
|
47013
|
+
if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
46484
47014
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
46485
|
-
await
|
|
47015
|
+
await mkdir12(workspacePath, { recursive: true });
|
|
47016
|
+
}
|
|
47017
|
+
if (evalCase.workspace?.repos?.length && workspacePath) {
|
|
47018
|
+
const perCaseRepoManager = new RepoManager();
|
|
47019
|
+
try {
|
|
47020
|
+
await perCaseRepoManager.materializeAll(evalCase.workspace.repos, workspacePath);
|
|
47021
|
+
} catch (error40) {
|
|
47022
|
+
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
47023
|
+
return buildErrorResult(
|
|
47024
|
+
evalCase,
|
|
47025
|
+
target.name,
|
|
47026
|
+
nowFn(),
|
|
47027
|
+
new Error(`Failed to materialize repos: ${message}`),
|
|
47028
|
+
promptInputs,
|
|
47029
|
+
provider
|
|
47030
|
+
);
|
|
47031
|
+
}
|
|
46486
47032
|
}
|
|
46487
47033
|
if (workspacePath && evalCase.workspace?.before_all) {
|
|
46488
47034
|
const scriptContext = {
|
|
@@ -46606,8 +47152,8 @@ async function runEvalCase(options) {
|
|
|
46606
47152
|
}
|
|
46607
47153
|
return errorResult;
|
|
46608
47154
|
}
|
|
46609
|
-
if (
|
|
46610
|
-
await cache.set(
|
|
47155
|
+
if (cacheKey2 && cache && !cachedResponse) {
|
|
47156
|
+
await cache.set(cacheKey2, providerResponse);
|
|
46611
47157
|
}
|
|
46612
47158
|
const output = providerResponse.output;
|
|
46613
47159
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
@@ -46635,6 +47181,16 @@ async function runEvalCase(options) {
|
|
|
46635
47181
|
}
|
|
46636
47182
|
}
|
|
46637
47183
|
const providerError = extractProviderError(providerResponse);
|
|
47184
|
+
if (repoManager && workspacePath && evalCase.workspace?.reset?.after_each && evalCase.workspace.reset.strategy && evalCase.workspace.reset.strategy !== "none" && evalCase.workspace.repos) {
|
|
47185
|
+
try {
|
|
47186
|
+
await repoManager.reset(
|
|
47187
|
+
evalCase.workspace.repos,
|
|
47188
|
+
workspacePath,
|
|
47189
|
+
evalCase.workspace.reset.strategy
|
|
47190
|
+
);
|
|
47191
|
+
} catch {
|
|
47192
|
+
}
|
|
47193
|
+
}
|
|
46638
47194
|
if (workspacePath && evalCase.workspace?.after_each) {
|
|
46639
47195
|
const scriptContext = {
|
|
46640
47196
|
workspacePath,
|
|
@@ -46999,7 +47555,7 @@ async function runEvaluatorList(options) {
|
|
|
46999
47555
|
fileChanges,
|
|
47000
47556
|
workspacePath
|
|
47001
47557
|
};
|
|
47002
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
47558
|
+
const evalFileDir = evalCase.guideline_paths[0] ? path37.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
47003
47559
|
const dispatchContext = {
|
|
47004
47560
|
judgeProvider,
|
|
47005
47561
|
targetResolver,
|
|
@@ -47089,8 +47645,9 @@ async function runEvaluatorList(options) {
|
|
|
47089
47645
|
const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
|
|
47090
47646
|
return entry.score.score < minScore;
|
|
47091
47647
|
});
|
|
47092
|
-
const
|
|
47093
|
-
|
|
47648
|
+
const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
|
|
47649
|
+
const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
|
|
47650
|
+
scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
|
|
47094
47651
|
) : 0;
|
|
47095
47652
|
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
47096
47653
|
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
@@ -47230,7 +47787,7 @@ function extractProviderError(response) {
|
|
|
47230
47787
|
return trimmed.length > 0 ? trimmed : void 0;
|
|
47231
47788
|
}
|
|
47232
47789
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
47233
|
-
const hash =
|
|
47790
|
+
const hash = createHash2("sha256");
|
|
47234
47791
|
hash.update(provider.id);
|
|
47235
47792
|
hash.update(target.name);
|
|
47236
47793
|
hash.update(evalCase.id);
|
|
@@ -47317,13 +47874,13 @@ async function evaluate(config2) {
|
|
|
47317
47874
|
let evalCases;
|
|
47318
47875
|
let testFilePath;
|
|
47319
47876
|
if (config2.specFile) {
|
|
47320
|
-
testFilePath =
|
|
47877
|
+
testFilePath = path38.resolve(config2.specFile);
|
|
47321
47878
|
evalCases = await loadTests(testFilePath, repoRoot, {
|
|
47322
47879
|
verbose: config2.verbose,
|
|
47323
47880
|
filter: config2.filter
|
|
47324
47881
|
});
|
|
47325
47882
|
} else {
|
|
47326
|
-
testFilePath =
|
|
47883
|
+
testFilePath = path38.join(process.cwd(), "__programmatic__.yaml");
|
|
47327
47884
|
evalCases = (config2.tests ?? []).map((test) => {
|
|
47328
47885
|
const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
|
|
47329
47886
|
const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
|
|
@@ -47414,11 +47971,11 @@ function computeSummary(results, durationMs) {
|
|
|
47414
47971
|
var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
|
|
47415
47972
|
async function discoverDefaultTarget(repoRoot) {
|
|
47416
47973
|
const cwd = process.cwd();
|
|
47417
|
-
const chain = buildDirectoryChain(
|
|
47974
|
+
const chain = buildDirectoryChain(path38.join(cwd, "_placeholder"), repoRoot);
|
|
47418
47975
|
for (const dir of chain) {
|
|
47419
47976
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
47420
|
-
const targetsPath =
|
|
47421
|
-
if (!
|
|
47977
|
+
const targetsPath = path38.join(dir, candidate);
|
|
47978
|
+
if (!existsSync3(targetsPath)) continue;
|
|
47422
47979
|
try {
|
|
47423
47980
|
const definitions = await readTargetDefinitions(targetsPath);
|
|
47424
47981
|
const defaultTarget = definitions.find((d) => d.name === "default");
|
|
@@ -47432,11 +47989,11 @@ async function discoverDefaultTarget(repoRoot) {
|
|
|
47432
47989
|
async function loadEnvHierarchy(repoRoot) {
|
|
47433
47990
|
const { readFileSync: readFileSync2 } = await import("node:fs");
|
|
47434
47991
|
const cwd = process.cwd();
|
|
47435
|
-
const chain = buildDirectoryChain(
|
|
47992
|
+
const chain = buildDirectoryChain(path38.join(cwd, "_placeholder"), repoRoot);
|
|
47436
47993
|
const envFiles = [];
|
|
47437
47994
|
for (const dir of chain) {
|
|
47438
|
-
const envPath =
|
|
47439
|
-
if (
|
|
47995
|
+
const envPath = path38.join(dir, ".env");
|
|
47996
|
+
if (existsSync3(envPath)) envFiles.push(envPath);
|
|
47440
47997
|
}
|
|
47441
47998
|
for (let i = envFiles.length - 1; i >= 0; i--) {
|
|
47442
47999
|
try {
|
|
@@ -47503,12 +48060,12 @@ var CONFIG_FILE_NAMES = [
|
|
|
47503
48060
|
".agentv/config.js"
|
|
47504
48061
|
];
|
|
47505
48062
|
async function loadTsConfig(projectRoot) {
|
|
47506
|
-
const { existsSync:
|
|
48063
|
+
const { existsSync: existsSync4 } = await import("node:fs");
|
|
47507
48064
|
const { pathToFileURL } = await import("node:url");
|
|
47508
48065
|
const { join: join2 } = await import("node:path");
|
|
47509
48066
|
for (const fileName of CONFIG_FILE_NAMES) {
|
|
47510
48067
|
const filePath = join2(projectRoot, fileName);
|
|
47511
|
-
if (!
|
|
48068
|
+
if (!existsSync4(filePath)) {
|
|
47512
48069
|
continue;
|
|
47513
48070
|
}
|
|
47514
48071
|
try {
|
|
@@ -47616,13 +48173,13 @@ var ResponseCache = class {
|
|
|
47616
48173
|
}
|
|
47617
48174
|
async set(key, value) {
|
|
47618
48175
|
const filePath = this.keyToPath(key);
|
|
47619
|
-
const dir =
|
|
47620
|
-
await
|
|
47621
|
-
await
|
|
48176
|
+
const dir = path39.dirname(filePath);
|
|
48177
|
+
await mkdir13(dir, { recursive: true });
|
|
48178
|
+
await writeFile8(filePath, JSON.stringify(value, null, 2), "utf8");
|
|
47622
48179
|
}
|
|
47623
48180
|
keyToPath(key) {
|
|
47624
48181
|
const prefix = key.slice(0, 2);
|
|
47625
|
-
return
|
|
48182
|
+
return path39.join(this.cachePath, prefix, `${key}.json`);
|
|
47626
48183
|
}
|
|
47627
48184
|
};
|
|
47628
48185
|
function shouldEnableCache(params) {
|
|
@@ -48163,6 +48720,13 @@ export {
|
|
|
48163
48720
|
TokenUsageEvaluator,
|
|
48164
48721
|
ToolTrajectoryEvaluator,
|
|
48165
48722
|
runContainsAssertion,
|
|
48723
|
+
runContainsAnyAssertion,
|
|
48724
|
+
runContainsAllAssertion,
|
|
48725
|
+
runIcontainsAssertion,
|
|
48726
|
+
runIcontainsAnyAssertion,
|
|
48727
|
+
runIcontainsAllAssertion,
|
|
48728
|
+
runStartsWithAssertion,
|
|
48729
|
+
runEndsWithAssertion,
|
|
48166
48730
|
runRegexAssertion,
|
|
48167
48731
|
runIsJsonAssertion,
|
|
48168
48732
|
runEqualsAssertion,
|
|
@@ -48179,6 +48743,7 @@ export {
|
|
|
48179
48743
|
createTempWorkspace,
|
|
48180
48744
|
cleanupWorkspace,
|
|
48181
48745
|
cleanupEvalWorkspaces,
|
|
48746
|
+
RepoManager,
|
|
48182
48747
|
resolveWorkspaceTemplate,
|
|
48183
48748
|
executeWorkspaceScript,
|
|
48184
48749
|
runEvaluation,
|
|
@@ -48196,4 +48761,4 @@ export {
|
|
|
48196
48761
|
OtelStreamingObserver,
|
|
48197
48762
|
createAgentKernel
|
|
48198
48763
|
};
|
|
48199
|
-
//# sourceMappingURL=chunk-
|
|
48764
|
+
//# sourceMappingURL=chunk-EXJWRKKL.js.map
|