agentv 0.20.1 → 0.21.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -0
- package/dist/{chunk-GDGNKNKP.js → chunk-WOCXZEH4.js} +806 -230
- package/dist/chunk-WOCXZEH4.js.map +1 -0
- package/dist/cli.js +5 -2
- package/dist/cli.js.map +1 -1
- package/dist/index.js +3 -3
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +3 -3
- package/dist/templates/agentv/.env.template +23 -0
- package/package.json +4 -4
- package/dist/chunk-GDGNKNKP.js.map +0 -1
|
@@ -142,11 +142,20 @@ var require_dist = __commonJS({
|
|
|
142
142
|
|
|
143
143
|
// src/index.ts
|
|
144
144
|
import { readFileSync as readFileSync2 } from "node:fs";
|
|
145
|
-
import {
|
|
145
|
+
import { binary, run, subcommands as subcommands2 } from "cmd-ts";
|
|
146
146
|
|
|
147
147
|
// src/commands/eval/index.ts
|
|
148
148
|
import { stat as stat4 } from "node:fs/promises";
|
|
149
149
|
import path19 from "node:path";
|
|
150
|
+
import {
|
|
151
|
+
command,
|
|
152
|
+
flag,
|
|
153
|
+
number as number4,
|
|
154
|
+
option,
|
|
155
|
+
optional as optional2,
|
|
156
|
+
restPositionals,
|
|
157
|
+
string as string4
|
|
158
|
+
} from "cmd-ts";
|
|
150
159
|
import fg from "fast-glob";
|
|
151
160
|
|
|
152
161
|
// src/commands/eval/run-eval.ts
|
|
@@ -155,7 +164,7 @@ import { access as access6, mkdir as mkdir7 } from "node:fs/promises";
|
|
|
155
164
|
import path18 from "node:path";
|
|
156
165
|
import { pathToFileURL } from "node:url";
|
|
157
166
|
|
|
158
|
-
// ../../packages/core/dist/chunk-
|
|
167
|
+
// ../../packages/core/dist/chunk-B2J23S7D.js
|
|
159
168
|
import { constants } from "node:fs";
|
|
160
169
|
import { access, readFile } from "node:fs/promises";
|
|
161
170
|
import path from "node:path";
|
|
@@ -638,8 +647,8 @@ function getErrorMap() {
|
|
|
638
647
|
|
|
639
648
|
// ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/helpers/parseUtil.js
|
|
640
649
|
var makeIssue = (params) => {
|
|
641
|
-
const { data, path:
|
|
642
|
-
const fullPath = [...
|
|
650
|
+
const { data, path: path26, errorMaps, issueData } = params;
|
|
651
|
+
const fullPath = [...path26, ...issueData.path || []];
|
|
643
652
|
const fullIssue = {
|
|
644
653
|
...issueData,
|
|
645
654
|
path: fullPath
|
|
@@ -755,11 +764,11 @@ var errorUtil;
|
|
|
755
764
|
|
|
756
765
|
// ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/types.js
|
|
757
766
|
var ParseInputLazyPath = class {
|
|
758
|
-
constructor(parent, value,
|
|
767
|
+
constructor(parent, value, path26, key2) {
|
|
759
768
|
this._cachedPath = [];
|
|
760
769
|
this.parent = parent;
|
|
761
770
|
this.data = value;
|
|
762
|
-
this._path =
|
|
771
|
+
this._path = path26;
|
|
763
772
|
this._key = key2;
|
|
764
773
|
}
|
|
765
774
|
get path() {
|
|
@@ -1039,8 +1048,8 @@ var ZodType = class {
|
|
|
1039
1048
|
promise() {
|
|
1040
1049
|
return ZodPromise.create(this, this._def);
|
|
1041
1050
|
}
|
|
1042
|
-
or(
|
|
1043
|
-
return ZodUnion.create([this,
|
|
1051
|
+
or(option4) {
|
|
1052
|
+
return ZodUnion.create([this, option4], this._def);
|
|
1044
1053
|
}
|
|
1045
1054
|
and(incoming) {
|
|
1046
1055
|
return ZodIntersection.create(this, incoming, this._def);
|
|
@@ -2890,7 +2899,7 @@ var ZodUnion = class extends ZodType {
|
|
|
2890
2899
|
return INVALID;
|
|
2891
2900
|
}
|
|
2892
2901
|
if (ctx.common.async) {
|
|
2893
|
-
return Promise.all(options.map(async (
|
|
2902
|
+
return Promise.all(options.map(async (option4) => {
|
|
2894
2903
|
const childCtx = {
|
|
2895
2904
|
...ctx,
|
|
2896
2905
|
common: {
|
|
@@ -2900,7 +2909,7 @@ var ZodUnion = class extends ZodType {
|
|
|
2900
2909
|
parent: null
|
|
2901
2910
|
};
|
|
2902
2911
|
return {
|
|
2903
|
-
result: await
|
|
2912
|
+
result: await option4._parseAsync({
|
|
2904
2913
|
data: ctx.data,
|
|
2905
2914
|
path: ctx.path,
|
|
2906
2915
|
parent: childCtx
|
|
@@ -2911,7 +2920,7 @@ var ZodUnion = class extends ZodType {
|
|
|
2911
2920
|
} else {
|
|
2912
2921
|
let dirty = void 0;
|
|
2913
2922
|
const issues = [];
|
|
2914
|
-
for (const
|
|
2923
|
+
for (const option4 of options) {
|
|
2915
2924
|
const childCtx = {
|
|
2916
2925
|
...ctx,
|
|
2917
2926
|
common: {
|
|
@@ -2920,7 +2929,7 @@ var ZodUnion = class extends ZodType {
|
|
|
2920
2929
|
},
|
|
2921
2930
|
parent: null
|
|
2922
2931
|
};
|
|
2923
|
-
const result =
|
|
2932
|
+
const result = option4._parseSync({
|
|
2924
2933
|
data: ctx.data,
|
|
2925
2934
|
path: ctx.path,
|
|
2926
2935
|
parent: childCtx
|
|
@@ -3001,8 +3010,8 @@ var ZodDiscriminatedUnion = class _ZodDiscriminatedUnion extends ZodType {
|
|
|
3001
3010
|
}
|
|
3002
3011
|
const discriminator = this.discriminator;
|
|
3003
3012
|
const discriminatorValue = ctx.data[discriminator];
|
|
3004
|
-
const
|
|
3005
|
-
if (!
|
|
3013
|
+
const option4 = this.optionsMap.get(discriminatorValue);
|
|
3014
|
+
if (!option4) {
|
|
3006
3015
|
addIssueToContext(ctx, {
|
|
3007
3016
|
code: ZodIssueCode.invalid_union_discriminator,
|
|
3008
3017
|
options: Array.from(this.optionsMap.keys()),
|
|
@@ -3011,13 +3020,13 @@ var ZodDiscriminatedUnion = class _ZodDiscriminatedUnion extends ZodType {
|
|
|
3011
3020
|
return INVALID;
|
|
3012
3021
|
}
|
|
3013
3022
|
if (ctx.common.async) {
|
|
3014
|
-
return
|
|
3023
|
+
return option4._parseAsync({
|
|
3015
3024
|
data: ctx.data,
|
|
3016
3025
|
path: ctx.path,
|
|
3017
3026
|
parent: ctx
|
|
3018
3027
|
});
|
|
3019
3028
|
} else {
|
|
3020
|
-
return
|
|
3029
|
+
return option4._parseSync({
|
|
3021
3030
|
data: ctx.data,
|
|
3022
3031
|
path: ctx.path,
|
|
3023
3032
|
parent: ctx
|
|
@@ -4201,7 +4210,7 @@ var coerce = {
|
|
|
4201
4210
|
};
|
|
4202
4211
|
var NEVER = INVALID;
|
|
4203
4212
|
|
|
4204
|
-
// ../../packages/core/dist/chunk-
|
|
4213
|
+
// ../../packages/core/dist/chunk-B2J23S7D.js
|
|
4205
4214
|
async function fileExists(filePath) {
|
|
4206
4215
|
try {
|
|
4207
4216
|
await access(filePath, constants.F_OK);
|
|
@@ -4577,9 +4586,9 @@ function resolveVSCodeConfig(target, env, insiders) {
|
|
|
4577
4586
|
const dryRunSource = target.dry_run ?? target.dryRun;
|
|
4578
4587
|
const subagentRootSource = target.subagent_root ?? target.subagentRoot;
|
|
4579
4588
|
const defaultCommand = insiders ? "code-insiders" : "code";
|
|
4580
|
-
const
|
|
4589
|
+
const command5 = resolveOptionalLiteralString(commandSource) ?? defaultCommand;
|
|
4581
4590
|
return {
|
|
4582
|
-
command,
|
|
4591
|
+
command: command5,
|
|
4583
4592
|
waitForResponse: resolveOptionalBoolean(waitSource) ?? true,
|
|
4584
4593
|
dryRun: resolveOptionalBoolean(dryRunSource) ?? false,
|
|
4585
4594
|
subagentRoot: resolveOptionalString(subagentRootSource, env, `${target.name} subagent root`, {
|
|
@@ -5976,10 +5985,10 @@ function assignProp(target, prop, value) {
|
|
|
5976
5985
|
configurable: true
|
|
5977
5986
|
});
|
|
5978
5987
|
}
|
|
5979
|
-
function getElementAtPath(obj,
|
|
5980
|
-
if (!
|
|
5988
|
+
function getElementAtPath(obj, path26) {
|
|
5989
|
+
if (!path26)
|
|
5981
5990
|
return obj;
|
|
5982
|
-
return
|
|
5991
|
+
return path26.reduce((acc, key2) => acc?.[key2], obj);
|
|
5983
5992
|
}
|
|
5984
5993
|
function promiseAllObject(promisesObj) {
|
|
5985
5994
|
const keys = Object.keys(promisesObj);
|
|
@@ -6299,11 +6308,11 @@ function aborted(x, startIndex = 0) {
|
|
|
6299
6308
|
}
|
|
6300
6309
|
return false;
|
|
6301
6310
|
}
|
|
6302
|
-
function prefixIssues(
|
|
6311
|
+
function prefixIssues(path26, issues) {
|
|
6303
6312
|
return issues.map((iss) => {
|
|
6304
6313
|
var _a17;
|
|
6305
6314
|
(_a17 = iss).path ?? (_a17.path = []);
|
|
6306
|
-
iss.path.unshift(
|
|
6315
|
+
iss.path.unshift(path26);
|
|
6307
6316
|
return iss;
|
|
6308
6317
|
});
|
|
6309
6318
|
}
|
|
@@ -6440,7 +6449,7 @@ function treeifyError(error40, _mapper) {
|
|
|
6440
6449
|
return issue2.message;
|
|
6441
6450
|
};
|
|
6442
6451
|
const result = { errors: [] };
|
|
6443
|
-
const processError = (error41,
|
|
6452
|
+
const processError = (error41, path26 = []) => {
|
|
6444
6453
|
var _a17, _b8;
|
|
6445
6454
|
for (const issue2 of error41.issues) {
|
|
6446
6455
|
if (issue2.code === "invalid_union" && issue2.errors.length) {
|
|
@@ -6450,7 +6459,7 @@ function treeifyError(error40, _mapper) {
|
|
|
6450
6459
|
} else if (issue2.code === "invalid_element") {
|
|
6451
6460
|
processError({ issues: issue2.issues }, issue2.path);
|
|
6452
6461
|
} else {
|
|
6453
|
-
const fullpath = [...
|
|
6462
|
+
const fullpath = [...path26, ...issue2.path];
|
|
6454
6463
|
if (fullpath.length === 0) {
|
|
6455
6464
|
result.errors.push(mapper(issue2));
|
|
6456
6465
|
continue;
|
|
@@ -6480,9 +6489,9 @@ function treeifyError(error40, _mapper) {
|
|
|
6480
6489
|
processError(error40);
|
|
6481
6490
|
return result;
|
|
6482
6491
|
}
|
|
6483
|
-
function toDotPath(
|
|
6492
|
+
function toDotPath(path26) {
|
|
6484
6493
|
const segs = [];
|
|
6485
|
-
for (const seg of
|
|
6494
|
+
for (const seg of path26) {
|
|
6486
6495
|
if (typeof seg === "number")
|
|
6487
6496
|
segs.push(`[${seg}]`);
|
|
6488
6497
|
else if (typeof seg === "symbol")
|
|
@@ -8081,7 +8090,7 @@ var $ZodUnion = /* @__PURE__ */ $constructor("$ZodUnion", (inst, def) => {
|
|
|
8081
8090
|
defineLazy(inst._zod, "optout", () => def.options.some((o) => o._zod.optout === "optional") ? "optional" : void 0);
|
|
8082
8091
|
defineLazy(inst._zod, "values", () => {
|
|
8083
8092
|
if (def.options.every((o) => o._zod.values)) {
|
|
8084
|
-
return new Set(def.options.flatMap((
|
|
8093
|
+
return new Set(def.options.flatMap((option4) => Array.from(option4._zod.values)));
|
|
8085
8094
|
}
|
|
8086
8095
|
return void 0;
|
|
8087
8096
|
});
|
|
@@ -8095,8 +8104,8 @@ var $ZodUnion = /* @__PURE__ */ $constructor("$ZodUnion", (inst, def) => {
|
|
|
8095
8104
|
inst._zod.parse = (payload, ctx) => {
|
|
8096
8105
|
let async = false;
|
|
8097
8106
|
const results = [];
|
|
8098
|
-
for (const
|
|
8099
|
-
const result =
|
|
8107
|
+
for (const option4 of def.options) {
|
|
8108
|
+
const result = option4._zod.run({
|
|
8100
8109
|
value: payload.value,
|
|
8101
8110
|
issues: []
|
|
8102
8111
|
}, ctx);
|
|
@@ -8121,10 +8130,10 @@ var $ZodDiscriminatedUnion = /* @__PURE__ */ $constructor("$ZodDiscriminatedUnio
|
|
|
8121
8130
|
const _super = inst._zod.parse;
|
|
8122
8131
|
defineLazy(inst._zod, "propValues", () => {
|
|
8123
8132
|
const propValues = {};
|
|
8124
|
-
for (const
|
|
8125
|
-
const pv =
|
|
8133
|
+
for (const option4 of def.options) {
|
|
8134
|
+
const pv = option4._zod.propValues;
|
|
8126
8135
|
if (!pv || Object.keys(pv).length === 0)
|
|
8127
|
-
throw new Error(`Invalid discriminated union option at index "${def.options.indexOf(
|
|
8136
|
+
throw new Error(`Invalid discriminated union option at index "${def.options.indexOf(option4)}"`);
|
|
8128
8137
|
for (const [k, v] of Object.entries(pv)) {
|
|
8129
8138
|
if (!propValues[k])
|
|
8130
8139
|
propValues[k] = /* @__PURE__ */ new Set();
|
|
@@ -15328,8 +15337,8 @@ function isTransforming(_schema, _ctx) {
|
|
|
15328
15337
|
return false;
|
|
15329
15338
|
}
|
|
15330
15339
|
case "union": {
|
|
15331
|
-
for (const
|
|
15332
|
-
if (isTransforming(
|
|
15340
|
+
for (const option4 of def.options) {
|
|
15341
|
+
if (isTransforming(option4, ctx))
|
|
15333
15342
|
return true;
|
|
15334
15343
|
}
|
|
15335
15344
|
return false;
|
|
@@ -26035,14 +26044,14 @@ function createAzure(options = {}) {
|
|
|
26035
26044
|
description: "Azure OpenAI resource name"
|
|
26036
26045
|
});
|
|
26037
26046
|
const apiVersion = (_a17 = options.apiVersion) != null ? _a17 : "v1";
|
|
26038
|
-
const url2 = ({ path:
|
|
26047
|
+
const url2 = ({ path: path26, modelId }) => {
|
|
26039
26048
|
var _a24;
|
|
26040
26049
|
const baseUrlPrefix = (_a24 = options.baseURL) != null ? _a24 : `https://${getResourceName()}.openai.azure.com/openai`;
|
|
26041
26050
|
let fullUrl;
|
|
26042
26051
|
if (options.useDeploymentBasedUrls) {
|
|
26043
|
-
fullUrl = new URL(`${baseUrlPrefix}/deployments/${modelId}${
|
|
26052
|
+
fullUrl = new URL(`${baseUrlPrefix}/deployments/${modelId}${path26}`);
|
|
26044
26053
|
} else {
|
|
26045
|
-
fullUrl = new URL(`${baseUrlPrefix}/v1${
|
|
26054
|
+
fullUrl = new URL(`${baseUrlPrefix}/v1${path26}`);
|
|
26046
26055
|
}
|
|
26047
26056
|
fullUrl.searchParams.set("api-version", apiVersion);
|
|
26048
26057
|
return fullUrl.toString();
|
|
@@ -34553,7 +34562,7 @@ function isTestMessage(value) {
|
|
|
34553
34562
|
}
|
|
34554
34563
|
return candidate.content.every(isJsonObject);
|
|
34555
34564
|
}
|
|
34556
|
-
var EVALUATOR_KIND_VALUES = ["code", "llm_judge"];
|
|
34565
|
+
var EVALUATOR_KIND_VALUES = ["code", "llm_judge", "rubric"];
|
|
34557
34566
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
34558
34567
|
function isEvaluatorKind(value) {
|
|
34559
34568
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
@@ -34920,6 +34929,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
34920
34929
|
}
|
|
34921
34930
|
}
|
|
34922
34931
|
const _model = asString2(rawEvaluator.model);
|
|
34932
|
+
if (typeValue === "rubric") {
|
|
34933
|
+
const rubrics = rawEvaluator.rubrics;
|
|
34934
|
+
if (!Array.isArray(rubrics)) {
|
|
34935
|
+
logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': missing rubrics array`);
|
|
34936
|
+
continue;
|
|
34937
|
+
}
|
|
34938
|
+
const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
|
|
34939
|
+
id: asString2(rubric.id) ?? `rubric-${index + 1}`,
|
|
34940
|
+
description: asString2(rubric.description) ?? "",
|
|
34941
|
+
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
34942
|
+
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
34943
|
+
})).filter((r) => r.description.length > 0);
|
|
34944
|
+
if (parsedRubrics.length === 0) {
|
|
34945
|
+
logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': no valid rubrics found`);
|
|
34946
|
+
continue;
|
|
34947
|
+
}
|
|
34948
|
+
evaluators.push({
|
|
34949
|
+
name: name16,
|
|
34950
|
+
type: "rubric",
|
|
34951
|
+
rubrics: parsedRubrics
|
|
34952
|
+
});
|
|
34953
|
+
continue;
|
|
34954
|
+
}
|
|
34923
34955
|
evaluators.push({
|
|
34924
34956
|
name: name16,
|
|
34925
34957
|
type: "llm_judge",
|
|
@@ -35390,7 +35422,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
35390
35422
|
continue;
|
|
35391
35423
|
}
|
|
35392
35424
|
const conversationId = asString5(evalcase.conversation_id);
|
|
35393
|
-
const outcome = asString5(evalcase.outcome);
|
|
35425
|
+
const outcome = asString5(evalcase.expected_outcome) ?? asString5(evalcase.outcome);
|
|
35394
35426
|
const inputMessagesValue = evalcase.input_messages;
|
|
35395
35427
|
const expectedMessagesValue = evalcase.expected_messages;
|
|
35396
35428
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
@@ -35444,6 +35476,33 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
35444
35476
|
logError(`Skipping eval case '${id}': ${message}`);
|
|
35445
35477
|
continue;
|
|
35446
35478
|
}
|
|
35479
|
+
const inlineRubrics = evalcase.rubrics;
|
|
35480
|
+
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
35481
|
+
const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
|
|
35482
|
+
if (typeof rubric === "string") {
|
|
35483
|
+
return {
|
|
35484
|
+
id: `rubric-${index + 1}`,
|
|
35485
|
+
description: rubric,
|
|
35486
|
+
weight: 1,
|
|
35487
|
+
required: true
|
|
35488
|
+
};
|
|
35489
|
+
}
|
|
35490
|
+
return {
|
|
35491
|
+
id: asString5(rubric.id) ?? `rubric-${index + 1}`,
|
|
35492
|
+
description: asString5(rubric.description) ?? "",
|
|
35493
|
+
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
35494
|
+
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
35495
|
+
};
|
|
35496
|
+
}).filter((r) => r.description.length > 0);
|
|
35497
|
+
if (rubricItems.length > 0) {
|
|
35498
|
+
const rubricEvaluator = {
|
|
35499
|
+
name: "rubric",
|
|
35500
|
+
type: "rubric",
|
|
35501
|
+
rubrics: rubricItems
|
|
35502
|
+
};
|
|
35503
|
+
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
35504
|
+
}
|
|
35505
|
+
}
|
|
35447
35506
|
const userFilePaths = [];
|
|
35448
35507
|
for (const segment of inputSegments) {
|
|
35449
35508
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -35536,6 +35595,9 @@ var AzureProvider = class {
|
|
|
35536
35595
|
retryConfig: this.retryConfig
|
|
35537
35596
|
});
|
|
35538
35597
|
}
|
|
35598
|
+
asLanguageModel() {
|
|
35599
|
+
return this.model;
|
|
35600
|
+
}
|
|
35539
35601
|
};
|
|
35540
35602
|
var AnthropicProvider = class {
|
|
35541
35603
|
constructor(targetName, config2) {
|
|
@@ -35569,6 +35631,9 @@ var AnthropicProvider = class {
|
|
|
35569
35631
|
providerOptions
|
|
35570
35632
|
});
|
|
35571
35633
|
}
|
|
35634
|
+
asLanguageModel() {
|
|
35635
|
+
return this.model;
|
|
35636
|
+
}
|
|
35572
35637
|
};
|
|
35573
35638
|
var GeminiProvider = class {
|
|
35574
35639
|
constructor(targetName, config2) {
|
|
@@ -35599,6 +35664,9 @@ var GeminiProvider = class {
|
|
|
35599
35664
|
retryConfig: this.retryConfig
|
|
35600
35665
|
});
|
|
35601
35666
|
}
|
|
35667
|
+
asLanguageModel() {
|
|
35668
|
+
return this.model;
|
|
35669
|
+
}
|
|
35602
35670
|
};
|
|
35603
35671
|
function buildAzureOptions(config2) {
|
|
35604
35672
|
const options = {
|
|
@@ -35828,7 +35896,7 @@ async function withRetry(fn, retryConfig, signal) {
|
|
|
35828
35896
|
}
|
|
35829
35897
|
var execAsync2 = promisify2(execWithCallback);
|
|
35830
35898
|
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
|
|
35831
|
-
async function defaultCommandRunner(
|
|
35899
|
+
async function defaultCommandRunner(command5, options) {
|
|
35832
35900
|
const execOptions = {
|
|
35833
35901
|
cwd: options.cwd,
|
|
35834
35902
|
env: options.env,
|
|
@@ -35838,7 +35906,7 @@ async function defaultCommandRunner(command, options) {
|
|
|
35838
35906
|
shell: process.platform === "win32" ? "powershell.exe" : void 0
|
|
35839
35907
|
};
|
|
35840
35908
|
try {
|
|
35841
|
-
const { stdout, stderr } = await execAsync2(
|
|
35909
|
+
const { stdout, stderr } = await execAsync2(command5, execOptions);
|
|
35842
35910
|
return {
|
|
35843
35911
|
stdout,
|
|
35844
35912
|
stderr,
|
|
@@ -37262,6 +37330,144 @@ function createProvider(target) {
|
|
|
37262
37330
|
}
|
|
37263
37331
|
}
|
|
37264
37332
|
}
|
|
37333
|
+
var rubricCheckResultSchema = external_exports.object({
|
|
37334
|
+
id: external_exports.string().describe("The ID of the rubric item being checked"),
|
|
37335
|
+
satisfied: external_exports.boolean().describe("Whether this rubric requirement is met"),
|
|
37336
|
+
reasoning: external_exports.string().describe("Brief explanation (1-2 sentences) for this check")
|
|
37337
|
+
});
|
|
37338
|
+
var rubricEvaluationSchema = external_exports.object({
|
|
37339
|
+
checks: external_exports.array(rubricCheckResultSchema).describe("Results for each rubric item"),
|
|
37340
|
+
overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)")
|
|
37341
|
+
});
|
|
37342
|
+
var RubricEvaluator = class {
|
|
37343
|
+
kind = "rubric";
|
|
37344
|
+
config;
|
|
37345
|
+
resolveJudgeProvider;
|
|
37346
|
+
constructor(options) {
|
|
37347
|
+
this.config = options.config;
|
|
37348
|
+
this.resolveJudgeProvider = options.resolveJudgeProvider;
|
|
37349
|
+
}
|
|
37350
|
+
async evaluate(context) {
|
|
37351
|
+
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
37352
|
+
if (!judgeProvider) {
|
|
37353
|
+
throw new Error("No judge provider available for rubric evaluation");
|
|
37354
|
+
}
|
|
37355
|
+
if (!this.config.rubrics || this.config.rubrics.length === 0) {
|
|
37356
|
+
throw new Error(
|
|
37357
|
+
`No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
|
|
37358
|
+
);
|
|
37359
|
+
}
|
|
37360
|
+
const prompt = this.buildPrompt(context, this.config.rubrics);
|
|
37361
|
+
const model = judgeProvider.asLanguageModel?.();
|
|
37362
|
+
if (!model) {
|
|
37363
|
+
throw new Error("Judge provider does not support language model interface");
|
|
37364
|
+
}
|
|
37365
|
+
const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
|
|
37366
|
+
You must return a valid JSON object matching this schema:
|
|
37367
|
+
{
|
|
37368
|
+
"checks": [
|
|
37369
|
+
{
|
|
37370
|
+
"id": "string (rubric id)",
|
|
37371
|
+
"satisfied": boolean,
|
|
37372
|
+
"reasoning": "string (brief explanation)"
|
|
37373
|
+
}
|
|
37374
|
+
],
|
|
37375
|
+
"overall_reasoning": "string (summary)"
|
|
37376
|
+
}`;
|
|
37377
|
+
let result;
|
|
37378
|
+
let lastError;
|
|
37379
|
+
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
37380
|
+
try {
|
|
37381
|
+
const { text: text2 } = await generateText({
|
|
37382
|
+
model,
|
|
37383
|
+
system,
|
|
37384
|
+
prompt
|
|
37385
|
+
});
|
|
37386
|
+
const cleaned = text2.replace(/```json\n?|```/g, "").trim();
|
|
37387
|
+
result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
|
|
37388
|
+
break;
|
|
37389
|
+
} catch (e) {
|
|
37390
|
+
lastError = e instanceof Error ? e : new Error(String(e));
|
|
37391
|
+
}
|
|
37392
|
+
}
|
|
37393
|
+
if (!result) {
|
|
37394
|
+
throw new Error(
|
|
37395
|
+
`Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
|
|
37396
|
+
);
|
|
37397
|
+
}
|
|
37398
|
+
const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
|
|
37399
|
+
return {
|
|
37400
|
+
score,
|
|
37401
|
+
verdict,
|
|
37402
|
+
hits,
|
|
37403
|
+
misses,
|
|
37404
|
+
expectedAspectCount: this.config.rubrics.length,
|
|
37405
|
+
reasoning: result.overall_reasoning,
|
|
37406
|
+
evaluatorRawRequest: {
|
|
37407
|
+
prompt
|
|
37408
|
+
}
|
|
37409
|
+
};
|
|
37410
|
+
}
|
|
37411
|
+
buildPrompt(context, rubrics) {
|
|
37412
|
+
const parts = [
|
|
37413
|
+
"You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
|
|
37414
|
+
"",
|
|
37415
|
+
"[[ ## question ## ]]",
|
|
37416
|
+
context.evalCase.question,
|
|
37417
|
+
"",
|
|
37418
|
+
"[[ ## expected_outcome ## ]]",
|
|
37419
|
+
context.evalCase.expected_outcome,
|
|
37420
|
+
""
|
|
37421
|
+
];
|
|
37422
|
+
if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
|
|
37423
|
+
parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
|
|
37424
|
+
}
|
|
37425
|
+
parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
|
|
37426
|
+
for (const rubric of rubrics) {
|
|
37427
|
+
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
37428
|
+
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
37429
|
+
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
|
|
37430
|
+
}
|
|
37431
|
+
parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
|
|
37432
|
+
return parts.join("\n");
|
|
37433
|
+
}
|
|
37434
|
+
calculateScore(result, rubrics) {
|
|
37435
|
+
const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
|
|
37436
|
+
const hits = [];
|
|
37437
|
+
const misses = [];
|
|
37438
|
+
let totalWeight = 0;
|
|
37439
|
+
let earnedWeight = 0;
|
|
37440
|
+
let failedRequired = false;
|
|
37441
|
+
for (const check2 of result.checks) {
|
|
37442
|
+
const rubric = rubricMap.get(check2.id);
|
|
37443
|
+
if (!rubric) {
|
|
37444
|
+
continue;
|
|
37445
|
+
}
|
|
37446
|
+
totalWeight += rubric.weight;
|
|
37447
|
+
if (check2.satisfied) {
|
|
37448
|
+
earnedWeight += rubric.weight;
|
|
37449
|
+
hits.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
|
|
37450
|
+
} else {
|
|
37451
|
+
misses.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
|
|
37452
|
+
if (rubric.required) {
|
|
37453
|
+
failedRequired = true;
|
|
37454
|
+
}
|
|
37455
|
+
}
|
|
37456
|
+
}
|
|
37457
|
+
const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
|
|
37458
|
+
let verdict;
|
|
37459
|
+
if (failedRequired) {
|
|
37460
|
+
verdict = "fail";
|
|
37461
|
+
} else if (score >= 0.8) {
|
|
37462
|
+
verdict = "pass";
|
|
37463
|
+
} else if (score >= 0.6) {
|
|
37464
|
+
verdict = "borderline";
|
|
37465
|
+
} else {
|
|
37466
|
+
verdict = "fail";
|
|
37467
|
+
}
|
|
37468
|
+
return { score, verdict, hits, misses };
|
|
37469
|
+
}
|
|
37470
|
+
};
|
|
37265
37471
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
37266
37472
|
|
|
37267
37473
|
Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
@@ -37624,7 +37830,7 @@ function pLimit(concurrency) {
|
|
|
37624
37830
|
activeCount--;
|
|
37625
37831
|
resumeNext();
|
|
37626
37832
|
};
|
|
37627
|
-
const
|
|
37833
|
+
const run2 = async (function_, resolve2, arguments_) => {
|
|
37628
37834
|
const result = (async () => function_(...arguments_))();
|
|
37629
37835
|
resolve2(result);
|
|
37630
37836
|
try {
|
|
@@ -37637,7 +37843,7 @@ function pLimit(concurrency) {
|
|
|
37637
37843
|
new Promise((internalResolve) => {
|
|
37638
37844
|
queue.enqueue(internalResolve);
|
|
37639
37845
|
}).then(
|
|
37640
|
-
|
|
37846
|
+
run2.bind(void 0, function_, resolve2, arguments_)
|
|
37641
37847
|
);
|
|
37642
37848
|
(async () => {
|
|
37643
37849
|
await Promise.resolve();
|
|
@@ -38214,6 +38420,7 @@ async function runEvaluatorList(options) {
|
|
|
38214
38420
|
name: evaluator.name,
|
|
38215
38421
|
type: evaluator.type,
|
|
38216
38422
|
score: score2.score,
|
|
38423
|
+
verdict: score2.verdict,
|
|
38217
38424
|
hits: score2.hits,
|
|
38218
38425
|
misses: score2.misses,
|
|
38219
38426
|
reasoning: score2.reasoning,
|
|
@@ -38241,6 +38448,40 @@ async function runEvaluatorList(options) {
|
|
|
38241
38448
|
name: evaluator.name,
|
|
38242
38449
|
type: evaluator.type,
|
|
38243
38450
|
score: score2.score,
|
|
38451
|
+
verdict: score2.verdict,
|
|
38452
|
+
hits: score2.hits,
|
|
38453
|
+
misses: score2.misses,
|
|
38454
|
+
reasoning: score2.reasoning,
|
|
38455
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
38456
|
+
});
|
|
38457
|
+
continue;
|
|
38458
|
+
}
|
|
38459
|
+
if (evaluator.type === "rubric") {
|
|
38460
|
+
const rubricEvaluator = new RubricEvaluator({
|
|
38461
|
+
config: evaluator,
|
|
38462
|
+
resolveJudgeProvider: async (context) => {
|
|
38463
|
+
if (context.judgeProvider) {
|
|
38464
|
+
return context.judgeProvider;
|
|
38465
|
+
}
|
|
38466
|
+
return judgeProvider;
|
|
38467
|
+
}
|
|
38468
|
+
});
|
|
38469
|
+
const score2 = await rubricEvaluator.evaluate({
|
|
38470
|
+
evalCase,
|
|
38471
|
+
candidate,
|
|
38472
|
+
target,
|
|
38473
|
+
provider,
|
|
38474
|
+
attempt,
|
|
38475
|
+
promptInputs,
|
|
38476
|
+
now,
|
|
38477
|
+
judgeProvider
|
|
38478
|
+
});
|
|
38479
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
38480
|
+
evaluatorResults.push({
|
|
38481
|
+
name: evaluator.name,
|
|
38482
|
+
type: evaluator.type,
|
|
38483
|
+
score: score2.score,
|
|
38484
|
+
verdict: score2.verdict,
|
|
38244
38485
|
hits: score2.hits,
|
|
38245
38486
|
misses: score2.misses,
|
|
38246
38487
|
reasoning: score2.reasoning,
|
|
@@ -38470,8 +38711,81 @@ function isTimeoutLike(error40) {
|
|
|
38470
38711
|
const value = String(error40).toLowerCase();
|
|
38471
38712
|
return value.includes("timeout");
|
|
38472
38713
|
}
|
|
38473
|
-
|
|
38474
|
-
|
|
38714
|
+
var rubricItemSchema = external_exports.object({
|
|
38715
|
+
id: external_exports.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
38716
|
+
description: external_exports.string().describe("What this rubric checks for"),
|
|
38717
|
+
weight: external_exports.number().default(1).describe("Relative importance (default 1.0)"),
|
|
38718
|
+
required: external_exports.boolean().default(true).describe("Whether this is a mandatory requirement")
|
|
38719
|
+
});
|
|
38720
|
+
var rubricGenerationSchema = external_exports.object({
|
|
38721
|
+
rubrics: external_exports.array(rubricItemSchema).describe("List of evaluation rubrics")
|
|
38722
|
+
});
|
|
38723
|
+
async function generateRubrics(options) {
|
|
38724
|
+
const { expectedOutcome, question, referenceAnswer, provider } = options;
|
|
38725
|
+
const prompt = buildPrompt(expectedOutcome, question, referenceAnswer);
|
|
38726
|
+
const model = provider.asLanguageModel?.();
|
|
38727
|
+
if (!model) {
|
|
38728
|
+
throw new Error("Provider does not support language model interface");
|
|
38729
|
+
}
|
|
38730
|
+
const system = `You are an expert at creating evaluation rubrics.
|
|
38731
|
+
You must return a valid JSON object matching this schema:
|
|
38732
|
+
{
|
|
38733
|
+
"rubrics": [
|
|
38734
|
+
{
|
|
38735
|
+
"id": "string (short identifier)",
|
|
38736
|
+
"description": "string (what to check)",
|
|
38737
|
+
"weight": number (default 1.0),
|
|
38738
|
+
"required": boolean (default true)
|
|
38739
|
+
}
|
|
38740
|
+
]
|
|
38741
|
+
}`;
|
|
38742
|
+
let result;
|
|
38743
|
+
let lastError;
|
|
38744
|
+
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
38745
|
+
try {
|
|
38746
|
+
const { text: text2 } = await generateText({
|
|
38747
|
+
model,
|
|
38748
|
+
system,
|
|
38749
|
+
prompt
|
|
38750
|
+
});
|
|
38751
|
+
const cleaned = text2.replace(/```json\n?|```/g, "").trim();
|
|
38752
|
+
result = rubricGenerationSchema.parse(JSON.parse(cleaned));
|
|
38753
|
+
break;
|
|
38754
|
+
} catch (e) {
|
|
38755
|
+
lastError = e instanceof Error ? e : new Error(String(e));
|
|
38756
|
+
}
|
|
38757
|
+
}
|
|
38758
|
+
if (!result) {
|
|
38759
|
+
throw new Error(`Failed to parse generated rubrics after 3 attempts: ${lastError?.message}`);
|
|
38760
|
+
}
|
|
38761
|
+
return result.rubrics;
|
|
38762
|
+
}
|
|
38763
|
+
function buildPrompt(expectedOutcome, question, referenceAnswer) {
|
|
38764
|
+
const parts = [
|
|
38765
|
+
"You are an expert at creating evaluation rubrics.",
|
|
38766
|
+
"Given the expected outcome (and optionally the question and reference answer),",
|
|
38767
|
+
"generate a list of specific, measurable rubric items to evaluate whether an answer meets the expected outcome.",
|
|
38768
|
+
"",
|
|
38769
|
+
"Each rubric should:",
|
|
38770
|
+
"- Be specific and testable",
|
|
38771
|
+
"- Have a short, descriptive ID",
|
|
38772
|
+
"- Include a clear description of what to check",
|
|
38773
|
+
"- Indicate if it is required (mandatory) or optional",
|
|
38774
|
+
"- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
|
|
38775
|
+
"",
|
|
38776
|
+
"Generate 3-7 rubric items that comprehensively cover the expected outcome.",
|
|
38777
|
+
"",
|
|
38778
|
+
"[[ ## expected_outcome ## ]]",
|
|
38779
|
+
expectedOutcome,
|
|
38780
|
+
""
|
|
38781
|
+
];
|
|
38782
|
+
if (question && question.trim().length > 0) {
|
|
38783
|
+
parts.push("[[ ## question ## ]]", question, "");
|
|
38784
|
+
}
|
|
38785
|
+
if (referenceAnswer && referenceAnswer.trim().length > 0) {
|
|
38786
|
+
parts.push("[[ ## reference_answer ## ]]", referenceAnswer, "");
|
|
38787
|
+
}
|
|
38788
|
+
return parts.join("\n");
|
|
38475
38789
|
}
|
|
38476
38790
|
|
|
38477
38791
|
// src/commands/eval/env.ts
|
|
@@ -38927,12 +39241,12 @@ var ProgressDisplay = class {
|
|
|
38927
39241
|
}
|
|
38928
39242
|
addLogPaths(paths) {
|
|
38929
39243
|
const newPaths = [];
|
|
38930
|
-
for (const
|
|
38931
|
-
if (this.logPathSet.has(
|
|
39244
|
+
for (const path26 of paths) {
|
|
39245
|
+
if (this.logPathSet.has(path26)) {
|
|
38932
39246
|
continue;
|
|
38933
39247
|
}
|
|
38934
|
-
this.logPathSet.add(
|
|
38935
|
-
newPaths.push(
|
|
39248
|
+
this.logPathSet.add(path26);
|
|
39249
|
+
newPaths.push(path26);
|
|
38936
39250
|
}
|
|
38937
39251
|
if (newPaths.length === 0) {
|
|
38938
39252
|
return;
|
|
@@ -38948,8 +39262,8 @@ var ProgressDisplay = class {
|
|
|
38948
39262
|
this.hasPrintedLogHeader = true;
|
|
38949
39263
|
}
|
|
38950
39264
|
const startIndex = this.logPaths.length - newPaths.length;
|
|
38951
|
-
newPaths.forEach((
|
|
38952
|
-
console.log(`${startIndex + offset + 1}. ${
|
|
39265
|
+
newPaths.forEach((path26, offset) => {
|
|
39266
|
+
console.log(`${startIndex + offset + 1}. ${path26}`);
|
|
38953
39267
|
});
|
|
38954
39268
|
}
|
|
38955
39269
|
scheduleRender() {
|
|
@@ -38997,8 +39311,8 @@ var ProgressDisplay = class {
|
|
|
38997
39311
|
if (this.logPaths.length > 0) {
|
|
38998
39312
|
lines.push("");
|
|
38999
39313
|
lines.push("Codex CLI logs:");
|
|
39000
|
-
this.logPaths.forEach((
|
|
39001
|
-
lines.push(`${index + 1}. ${
|
|
39314
|
+
this.logPaths.forEach((path26, index) => {
|
|
39315
|
+
lines.push(`${index + 1}. ${path26}`);
|
|
39002
39316
|
});
|
|
39003
39317
|
}
|
|
39004
39318
|
const rowCount = this.getRenderedRowCount(lines);
|
|
@@ -39203,24 +39517,20 @@ function formatEvaluationSummary(summary) {
|
|
|
39203
39517
|
return lines.join("\n");
|
|
39204
39518
|
}
|
|
39205
39519
|
|
|
39206
|
-
// src/commands/eval/targets.ts
|
|
39207
|
-
import { constants as constants5 } from "node:fs";
|
|
39208
|
-
import { access as access5 } from "node:fs/promises";
|
|
39209
|
-
import path17 from "node:path";
|
|
39210
|
-
|
|
39211
39520
|
// ../../packages/core/dist/evaluation/validation/index.js
|
|
39212
39521
|
import { readFile as readFile7 } from "node:fs/promises";
|
|
39522
|
+
import path16 from "node:path";
|
|
39213
39523
|
import { parse as parse6 } from "yaml";
|
|
39214
39524
|
import { readFile as readFile23 } from "node:fs/promises";
|
|
39215
|
-
import
|
|
39525
|
+
import path23 from "node:path";
|
|
39216
39526
|
import { parse as parse23 } from "yaml";
|
|
39217
39527
|
import { readFile as readFile33 } from "node:fs/promises";
|
|
39218
|
-
import
|
|
39528
|
+
import path33 from "node:path";
|
|
39219
39529
|
import { parse as parse33 } from "yaml";
|
|
39220
39530
|
import { readFile as readFile43 } from "node:fs/promises";
|
|
39221
39531
|
import { parse as parse42 } from "yaml";
|
|
39222
39532
|
import { readFile as readFile52 } from "node:fs/promises";
|
|
39223
|
-
import
|
|
39533
|
+
import path43 from "node:path";
|
|
39224
39534
|
import { parse as parse52 } from "yaml";
|
|
39225
39535
|
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
39226
39536
|
var SCHEMA_TARGETS_V2 = "agentv-targets-v2.2";
|
|
@@ -39230,12 +39540,12 @@ async function detectFileType(filePath) {
|
|
|
39230
39540
|
const content = await readFile7(filePath, "utf8");
|
|
39231
39541
|
const parsed = parse6(content);
|
|
39232
39542
|
if (typeof parsed !== "object" || parsed === null) {
|
|
39233
|
-
return
|
|
39543
|
+
return inferFileTypeFromPath(filePath);
|
|
39234
39544
|
}
|
|
39235
39545
|
const record2 = parsed;
|
|
39236
39546
|
const schema = record2.$schema;
|
|
39237
39547
|
if (typeof schema !== "string") {
|
|
39238
|
-
return
|
|
39548
|
+
return inferFileTypeFromPath(filePath);
|
|
39239
39549
|
}
|
|
39240
39550
|
switch (schema) {
|
|
39241
39551
|
case SCHEMA_EVAL_V2:
|
|
@@ -39245,18 +39555,31 @@ async function detectFileType(filePath) {
|
|
|
39245
39555
|
case SCHEMA_CONFIG_V22:
|
|
39246
39556
|
return "config";
|
|
39247
39557
|
default:
|
|
39248
|
-
return
|
|
39558
|
+
return inferFileTypeFromPath(filePath);
|
|
39249
39559
|
}
|
|
39250
39560
|
} catch {
|
|
39251
|
-
return
|
|
39561
|
+
return inferFileTypeFromPath(filePath);
|
|
39252
39562
|
}
|
|
39253
39563
|
}
|
|
39564
|
+
function inferFileTypeFromPath(filePath) {
|
|
39565
|
+
const normalized = path16.normalize(filePath).replace(/\\/g, "/");
|
|
39566
|
+
const basename = path16.basename(filePath);
|
|
39567
|
+
if (normalized.includes("/.agentv/")) {
|
|
39568
|
+
if (basename === "config.yaml" || basename === "config.yml") {
|
|
39569
|
+
return "config";
|
|
39570
|
+
}
|
|
39571
|
+
if (basename === "targets.yaml" || basename === "targets.yml") {
|
|
39572
|
+
return "targets";
|
|
39573
|
+
}
|
|
39574
|
+
}
|
|
39575
|
+
return "eval";
|
|
39576
|
+
}
|
|
39254
39577
|
function isObject2(value) {
|
|
39255
39578
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
39256
39579
|
}
|
|
39257
39580
|
async function validateEvalFile(filePath) {
|
|
39258
39581
|
const errors = [];
|
|
39259
|
-
const absolutePath =
|
|
39582
|
+
const absolutePath = path23.resolve(filePath);
|
|
39260
39583
|
let parsed;
|
|
39261
39584
|
try {
|
|
39262
39585
|
const content = await readFile23(absolutePath, "utf8");
|
|
@@ -39323,13 +39646,13 @@ async function validateEvalFile(filePath) {
|
|
|
39323
39646
|
message: "Missing or invalid 'id' field (must be a non-empty string)"
|
|
39324
39647
|
});
|
|
39325
39648
|
}
|
|
39326
|
-
const
|
|
39327
|
-
if (typeof
|
|
39649
|
+
const expectedOutcome = evalCase.expected_outcome ?? evalCase.outcome;
|
|
39650
|
+
if (expectedOutcome !== void 0 && (typeof expectedOutcome !== "string" || expectedOutcome.trim().length === 0)) {
|
|
39328
39651
|
errors.push({
|
|
39329
39652
|
severity: "error",
|
|
39330
39653
|
filePath: absolutePath,
|
|
39331
|
-
location: `${location}.
|
|
39332
|
-
message: "
|
|
39654
|
+
location: `${location}.expected_outcome`,
|
|
39655
|
+
message: "Invalid 'expected_outcome' or 'outcome' field (must be a non-empty string if provided)"
|
|
39333
39656
|
});
|
|
39334
39657
|
}
|
|
39335
39658
|
const inputMessages = evalCase.input_messages;
|
|
@@ -39605,7 +39928,7 @@ function validateUnknownSettings(target, provider, absolutePath, location, error
|
|
|
39605
39928
|
}
|
|
39606
39929
|
async function validateTargetsFile(filePath) {
|
|
39607
39930
|
const errors = [];
|
|
39608
|
-
const absolutePath =
|
|
39931
|
+
const absolutePath = path33.resolve(filePath);
|
|
39609
39932
|
let parsed;
|
|
39610
39933
|
try {
|
|
39611
39934
|
const content = await readFile33(absolutePath, "utf8");
|
|
@@ -39884,8 +40207,8 @@ async function validateConfigFile(filePath) {
|
|
|
39884
40207
|
}
|
|
39885
40208
|
const config2 = parsed;
|
|
39886
40209
|
const schema = config2.$schema;
|
|
39887
|
-
if (schema !== SCHEMA_CONFIG_V222) {
|
|
39888
|
-
const message =
|
|
40210
|
+
if (schema !== void 0 && schema !== SCHEMA_CONFIG_V222) {
|
|
40211
|
+
const message = `Invalid $schema value '${schema}'. Expected '${SCHEMA_CONFIG_V222}' or omit the field.`;
|
|
39889
40212
|
errors.push({
|
|
39890
40213
|
severity: "error",
|
|
39891
40214
|
filePath,
|
|
@@ -39947,7 +40270,7 @@ function isObject3(value) {
|
|
|
39947
40270
|
}
|
|
39948
40271
|
async function validateFileReferences(evalFilePath) {
|
|
39949
40272
|
const errors = [];
|
|
39950
|
-
const absolutePath =
|
|
40273
|
+
const absolutePath = path43.resolve(evalFilePath);
|
|
39951
40274
|
const gitRoot = await findGitRoot(absolutePath);
|
|
39952
40275
|
if (!gitRoot) {
|
|
39953
40276
|
errors.push({
|
|
@@ -40064,19 +40387,16 @@ async function validateMessagesFileRefs(messages, location, searchRoots, filePat
|
|
|
40064
40387
|
}
|
|
40065
40388
|
}
|
|
40066
40389
|
|
|
40067
|
-
// src/
|
|
40390
|
+
// src/utils/targets.ts
|
|
40391
|
+
import { constants as constants5 } from "node:fs";
|
|
40392
|
+
import { access as access5 } from "node:fs/promises";
|
|
40393
|
+
import path17 from "node:path";
|
|
40068
40394
|
var TARGET_FILE_CANDIDATES = [
|
|
40069
40395
|
"targets.yaml",
|
|
40070
40396
|
"targets.yml",
|
|
40071
40397
|
path17.join(".agentv", "targets.yaml"),
|
|
40072
40398
|
path17.join(".agentv", "targets.yml")
|
|
40073
40399
|
];
|
|
40074
|
-
var ANSI_YELLOW7 = "\x1B[33m";
|
|
40075
|
-
var ANSI_RED2 = "\x1B[31m";
|
|
40076
|
-
var ANSI_RESET7 = "\x1B[0m";
|
|
40077
|
-
function isTTY() {
|
|
40078
|
-
return process.stdout.isTTY ?? false;
|
|
40079
|
-
}
|
|
40080
40400
|
async function fileExists5(filePath) {
|
|
40081
40401
|
try {
|
|
40082
40402
|
await access5(filePath, constants5.F_OK);
|
|
@@ -40085,10 +40405,6 @@ async function fileExists5(filePath) {
|
|
|
40085
40405
|
return false;
|
|
40086
40406
|
}
|
|
40087
40407
|
}
|
|
40088
|
-
async function readTestSuiteTarget(testFilePath) {
|
|
40089
|
-
const metadata = await readTestSuiteMetadata(testFilePath);
|
|
40090
|
-
return metadata.target;
|
|
40091
|
-
}
|
|
40092
40408
|
async function discoverTargetsFile(options) {
|
|
40093
40409
|
const { explicitPath, testFilePath, repoRoot, cwd } = options;
|
|
40094
40410
|
if (explicitPath) {
|
|
@@ -40119,6 +40435,18 @@ async function discoverTargetsFile(options) {
|
|
|
40119
40435
|
}
|
|
40120
40436
|
throw new Error("Unable to locate targets.yaml. Use --targets to specify the file explicitly.");
|
|
40121
40437
|
}
|
|
40438
|
+
|
|
40439
|
+
// src/commands/eval/targets.ts
|
|
40440
|
+
var ANSI_YELLOW7 = "\x1B[33m";
|
|
40441
|
+
var ANSI_RED2 = "\x1B[31m";
|
|
40442
|
+
var ANSI_RESET7 = "\x1B[0m";
|
|
40443
|
+
function isTTY() {
|
|
40444
|
+
return process.stdout.isTTY ?? false;
|
|
40445
|
+
}
|
|
40446
|
+
async function readTestSuiteTarget(testFilePath) {
|
|
40447
|
+
const metadata = await readTestSuiteMetadata(testFilePath);
|
|
40448
|
+
return metadata.target;
|
|
40449
|
+
}
|
|
40122
40450
|
function pickTargetName(options) {
|
|
40123
40451
|
const cliName = options.cliTargetName?.trim();
|
|
40124
40452
|
if (cliName && cliName !== "default") {
|
|
@@ -40299,12 +40627,12 @@ function buildDefaultOutputPath(cwd, format) {
|
|
|
40299
40627
|
const extension = getDefaultExtension(format);
|
|
40300
40628
|
return path18.join(cwd, ".agentv", "results", `${baseName}_${timestamp}${extension}`);
|
|
40301
40629
|
}
|
|
40302
|
-
function resolvePromptDirectory(
|
|
40303
|
-
if (
|
|
40630
|
+
function resolvePromptDirectory(option4, cwd) {
|
|
40631
|
+
if (option4 === void 0) {
|
|
40304
40632
|
return void 0;
|
|
40305
40633
|
}
|
|
40306
|
-
if (typeof
|
|
40307
|
-
return path18.resolve(cwd,
|
|
40634
|
+
if (typeof option4 === "string" && option4.trim().length > 0) {
|
|
40635
|
+
return path18.resolve(cwd, option4);
|
|
40308
40636
|
}
|
|
40309
40637
|
return path18.join(cwd, ".agentv", "prompts");
|
|
40310
40638
|
}
|
|
@@ -40608,56 +40936,119 @@ async function resolveEvaluationRunner() {
|
|
|
40608
40936
|
}
|
|
40609
40937
|
|
|
40610
40938
|
// src/commands/eval/index.ts
|
|
40611
|
-
|
|
40612
|
-
|
|
40613
|
-
|
|
40614
|
-
|
|
40615
|
-
|
|
40616
|
-
|
|
40617
|
-
|
|
40618
|
-
|
|
40619
|
-
|
|
40620
|
-
|
|
40621
|
-
|
|
40622
|
-
|
|
40623
|
-
|
|
40624
|
-
|
|
40625
|
-
|
|
40626
|
-
|
|
40627
|
-
|
|
40628
|
-
|
|
40629
|
-
|
|
40630
|
-
|
|
40631
|
-
|
|
40632
|
-
|
|
40633
|
-
|
|
40634
|
-
|
|
40635
|
-
|
|
40636
|
-
|
|
40637
|
-
|
|
40638
|
-
|
|
40639
|
-
|
|
40640
|
-
|
|
40641
|
-
|
|
40642
|
-
|
|
40643
|
-
|
|
40644
|
-
|
|
40645
|
-
|
|
40646
|
-
|
|
40647
|
-
|
|
40648
|
-
|
|
40649
|
-
|
|
40650
|
-
|
|
40651
|
-
|
|
40652
|
-
|
|
40653
|
-
|
|
40654
|
-
|
|
40655
|
-
|
|
40656
|
-
|
|
40939
|
+
var evalCommand = command({
|
|
40940
|
+
name: "eval",
|
|
40941
|
+
description: "Run eval suites and report results",
|
|
40942
|
+
args: {
|
|
40943
|
+
evalPaths: restPositionals({
|
|
40944
|
+
type: string4,
|
|
40945
|
+
displayName: "eval-paths",
|
|
40946
|
+
description: "Path(s) or glob(s) to evaluation .yaml file(s)"
|
|
40947
|
+
}),
|
|
40948
|
+
target: option({
|
|
40949
|
+
type: string4,
|
|
40950
|
+
long: "target",
|
|
40951
|
+
description: "Override target name from targets.yaml",
|
|
40952
|
+
defaultValue: () => "default"
|
|
40953
|
+
}),
|
|
40954
|
+
targets: option({
|
|
40955
|
+
type: optional2(string4),
|
|
40956
|
+
long: "targets",
|
|
40957
|
+
description: "Path to targets.yaml (overrides discovery)"
|
|
40958
|
+
}),
|
|
40959
|
+
evalId: option({
|
|
40960
|
+
type: optional2(string4),
|
|
40961
|
+
long: "eval-id",
|
|
40962
|
+
description: "Run only the eval case with this identifier"
|
|
40963
|
+
}),
|
|
40964
|
+
workers: option({
|
|
40965
|
+
type: number4,
|
|
40966
|
+
long: "workers",
|
|
40967
|
+
description: "Number of parallel workers (default: 1, max: 50). Can also be set per-target in targets.yaml",
|
|
40968
|
+
defaultValue: () => 1
|
|
40969
|
+
}),
|
|
40970
|
+
out: option({
|
|
40971
|
+
type: optional2(string4),
|
|
40972
|
+
long: "out",
|
|
40973
|
+
description: "Write results to the specified path"
|
|
40974
|
+
}),
|
|
40975
|
+
outputFormat: option({
|
|
40976
|
+
type: string4,
|
|
40977
|
+
long: "output-format",
|
|
40978
|
+
description: "Output format: 'jsonl' or 'yaml' (default: jsonl)",
|
|
40979
|
+
defaultValue: () => "jsonl"
|
|
40980
|
+
}),
|
|
40981
|
+
dryRun: flag({
|
|
40982
|
+
long: "dry-run",
|
|
40983
|
+
description: "Use mock provider responses instead of real LLM calls"
|
|
40984
|
+
}),
|
|
40985
|
+
dryRunDelay: option({
|
|
40986
|
+
type: number4,
|
|
40987
|
+
long: "dry-run-delay",
|
|
40988
|
+
description: "Fixed delay in milliseconds for dry-run mode (overridden by delay range if specified)",
|
|
40989
|
+
defaultValue: () => 0
|
|
40990
|
+
}),
|
|
40991
|
+
dryRunDelayMin: option({
|
|
40992
|
+
type: number4,
|
|
40993
|
+
long: "dry-run-delay-min",
|
|
40994
|
+
description: "Minimum delay in milliseconds for dry-run mode (requires --dry-run-delay-max)",
|
|
40995
|
+
defaultValue: () => 0
|
|
40996
|
+
}),
|
|
40997
|
+
dryRunDelayMax: option({
|
|
40998
|
+
type: number4,
|
|
40999
|
+
long: "dry-run-delay-max",
|
|
41000
|
+
description: "Maximum delay in milliseconds for dry-run mode (requires --dry-run-delay-min)",
|
|
41001
|
+
defaultValue: () => 0
|
|
41002
|
+
}),
|
|
41003
|
+
agentTimeout: option({
|
|
41004
|
+
type: number4,
|
|
41005
|
+
long: "agent-timeout",
|
|
41006
|
+
description: "Timeout in seconds for provider responses (default: 120)",
|
|
41007
|
+
defaultValue: () => 120
|
|
41008
|
+
}),
|
|
41009
|
+
maxRetries: option({
|
|
41010
|
+
type: number4,
|
|
41011
|
+
long: "max-retries",
|
|
41012
|
+
description: "Retry count for timeout recoveries (default: 2)",
|
|
41013
|
+
defaultValue: () => 2
|
|
41014
|
+
}),
|
|
41015
|
+
cache: flag({
|
|
41016
|
+
long: "cache",
|
|
41017
|
+
description: "Enable in-memory provider response cache"
|
|
41018
|
+
}),
|
|
41019
|
+
verbose: flag({
|
|
41020
|
+
long: "verbose",
|
|
41021
|
+
description: "Enable verbose logging"
|
|
41022
|
+
}),
|
|
41023
|
+
dumpPrompts: option({
|
|
41024
|
+
type: optional2(string4),
|
|
41025
|
+
long: "dump-prompts",
|
|
41026
|
+
description: "Directory path for persisting prompt payloads for debugging"
|
|
41027
|
+
})
|
|
41028
|
+
},
|
|
41029
|
+
handler: async (args) => {
|
|
41030
|
+
const resolvedPaths = await resolveEvalPaths(args.evalPaths, process.cwd());
|
|
41031
|
+
const dumpPrompts = args.dumpPrompts !== void 0 ? args.dumpPrompts === "." ? true : args.dumpPrompts : void 0;
|
|
41032
|
+
const rawOptions = {
|
|
41033
|
+
target: args.target,
|
|
41034
|
+
targets: args.targets,
|
|
41035
|
+
evalId: args.evalId,
|
|
41036
|
+
workers: args.workers,
|
|
41037
|
+
out: args.out,
|
|
41038
|
+
outputFormat: args.outputFormat,
|
|
41039
|
+
dryRun: args.dryRun,
|
|
41040
|
+
dryRunDelay: args.dryRunDelay,
|
|
41041
|
+
dryRunDelayMin: args.dryRunDelayMin,
|
|
41042
|
+
dryRunDelayMax: args.dryRunDelayMax,
|
|
41043
|
+
agentTimeout: args.agentTimeout,
|
|
41044
|
+
maxRetries: args.maxRetries,
|
|
41045
|
+
cache: args.cache,
|
|
41046
|
+
verbose: args.verbose,
|
|
41047
|
+
dumpPrompts
|
|
41048
|
+
};
|
|
40657
41049
|
await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
|
|
40658
|
-
}
|
|
40659
|
-
|
|
40660
|
-
}
|
|
41050
|
+
}
|
|
41051
|
+
});
|
|
40661
41052
|
async function resolveEvalPaths(evalPaths, cwd) {
|
|
40662
41053
|
const normalizedInputs = evalPaths.map((value) => value?.trim()).filter((value) => value);
|
|
40663
41054
|
if (normalizedInputs.length === 0) {
|
|
@@ -40705,14 +41096,201 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
40705
41096
|
return sorted;
|
|
40706
41097
|
}
|
|
40707
41098
|
|
|
41099
|
+
// src/commands/generate/index.ts
|
|
41100
|
+
import { command as command2, flag as flag2, option as option2, optional as optional3, positional as positional2, string as string5, subcommands } from "cmd-ts";
|
|
41101
|
+
|
|
41102
|
+
// src/commands/generate/rubrics.ts
|
|
41103
|
+
import { readFile as readFile8, writeFile as writeFile6 } from "node:fs/promises";
|
|
41104
|
+
import path20 from "node:path";
|
|
41105
|
+
import { pathToFileURL as pathToFileURL2 } from "node:url";
|
|
41106
|
+
import { isMap, isSeq, parseDocument } from "yaml";
|
|
41107
|
+
function isJsonObject3(value) {
|
|
41108
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
41109
|
+
}
|
|
41110
|
+
function asString6(value) {
|
|
41111
|
+
return typeof value === "string" ? value : void 0;
|
|
41112
|
+
}
|
|
41113
|
+
async function loadRubricGenerator() {
|
|
41114
|
+
const customGenerator = process.env.AGENTEVO_CLI_RUBRIC_GENERATOR;
|
|
41115
|
+
if (customGenerator) {
|
|
41116
|
+
const generatorPath = path20.resolve(customGenerator);
|
|
41117
|
+
const generatorUrl = pathToFileURL2(generatorPath).href;
|
|
41118
|
+
const module = await import(generatorUrl);
|
|
41119
|
+
return module.generateRubrics;
|
|
41120
|
+
}
|
|
41121
|
+
return generateRubrics;
|
|
41122
|
+
}
|
|
41123
|
+
async function generateRubricsCommand(options) {
|
|
41124
|
+
const { file: file2, target: targetOverride, verbose } = options;
|
|
41125
|
+
console.log(`Generating rubrics for: ${file2}`);
|
|
41126
|
+
const absolutePath = path20.resolve(file2);
|
|
41127
|
+
const content = await readFile8(absolutePath, "utf8");
|
|
41128
|
+
const doc = parseDocument(content);
|
|
41129
|
+
const parsed = doc.toJSON();
|
|
41130
|
+
if (!isJsonObject3(parsed)) {
|
|
41131
|
+
throw new Error(`Invalid YAML file format: ${file2}`);
|
|
41132
|
+
}
|
|
41133
|
+
const suite = parsed;
|
|
41134
|
+
const evalcases = suite.evalcases;
|
|
41135
|
+
if (!Array.isArray(evalcases)) {
|
|
41136
|
+
throw new Error(`No evalcases found in ${file2}`);
|
|
41137
|
+
}
|
|
41138
|
+
const targetSelection = await selectTarget({
|
|
41139
|
+
testFilePath: absolutePath,
|
|
41140
|
+
repoRoot: process.cwd(),
|
|
41141
|
+
cwd: process.cwd(),
|
|
41142
|
+
cliTargetName: targetOverride,
|
|
41143
|
+
dryRun: false,
|
|
41144
|
+
dryRunDelay: 0,
|
|
41145
|
+
dryRunDelayMin: 0,
|
|
41146
|
+
dryRunDelayMax: 0,
|
|
41147
|
+
env: process.env
|
|
41148
|
+
});
|
|
41149
|
+
if (verbose) {
|
|
41150
|
+
console.log(`Using target: ${targetSelection.targetName}`);
|
|
41151
|
+
}
|
|
41152
|
+
const provider = createProvider(targetSelection.resolvedTarget);
|
|
41153
|
+
const generateRubricsFunc = await loadRubricGenerator();
|
|
41154
|
+
let updatedCount = 0;
|
|
41155
|
+
let skippedCount = 0;
|
|
41156
|
+
const evalcasesNode = doc.getIn(["evalcases"]);
|
|
41157
|
+
if (!evalcasesNode || !isSeq(evalcasesNode)) {
|
|
41158
|
+
throw new Error("evalcases must be a sequence");
|
|
41159
|
+
}
|
|
41160
|
+
for (let i = 0; i < evalcases.length; i++) {
|
|
41161
|
+
const rawCase = evalcases[i];
|
|
41162
|
+
if (!isJsonObject3(rawCase)) {
|
|
41163
|
+
continue;
|
|
41164
|
+
}
|
|
41165
|
+
const evalCase = rawCase;
|
|
41166
|
+
const id = asString6(evalCase.id) ?? "unknown";
|
|
41167
|
+
const expectedOutcome = asString6(evalCase.expected_outcome) ?? asString6(evalCase.outcome);
|
|
41168
|
+
if (!expectedOutcome) {
|
|
41169
|
+
if (verbose) {
|
|
41170
|
+
console.log(` Skipping ${id}: no expected_outcome`);
|
|
41171
|
+
}
|
|
41172
|
+
skippedCount++;
|
|
41173
|
+
continue;
|
|
41174
|
+
}
|
|
41175
|
+
if (evalCase.rubrics !== void 0) {
|
|
41176
|
+
if (verbose) {
|
|
41177
|
+
console.log(` Skipping ${id}: rubrics already defined`);
|
|
41178
|
+
}
|
|
41179
|
+
skippedCount++;
|
|
41180
|
+
continue;
|
|
41181
|
+
}
|
|
41182
|
+
console.log(` Generating rubrics for: ${id}`);
|
|
41183
|
+
const question = extractQuestion(evalCase);
|
|
41184
|
+
const referenceAnswer = asString6(evalCase.reference_answer);
|
|
41185
|
+
const rubrics = await generateRubricsFunc({
|
|
41186
|
+
expectedOutcome,
|
|
41187
|
+
question,
|
|
41188
|
+
referenceAnswer,
|
|
41189
|
+
provider
|
|
41190
|
+
});
|
|
41191
|
+
const caseNode = evalcasesNode.items[i];
|
|
41192
|
+
if (caseNode && isMap(caseNode)) {
|
|
41193
|
+
caseNode.set(
|
|
41194
|
+
"rubrics",
|
|
41195
|
+
rubrics.map(
|
|
41196
|
+
(r) => ({
|
|
41197
|
+
id: r.id,
|
|
41198
|
+
description: r.description,
|
|
41199
|
+
weight: r.weight,
|
|
41200
|
+
required: r.required
|
|
41201
|
+
})
|
|
41202
|
+
)
|
|
41203
|
+
);
|
|
41204
|
+
}
|
|
41205
|
+
updatedCount++;
|
|
41206
|
+
if (verbose) {
|
|
41207
|
+
console.log(` Generated ${rubrics.length} rubric(s)`);
|
|
41208
|
+
}
|
|
41209
|
+
}
|
|
41210
|
+
if (updatedCount > 0) {
|
|
41211
|
+
const output = doc.toString();
|
|
41212
|
+
await writeFile6(absolutePath, output, "utf8");
|
|
41213
|
+
console.log(`
|
|
41214
|
+
Updated ${updatedCount} eval case(s) with generated rubrics`);
|
|
41215
|
+
if (skippedCount > 0) {
|
|
41216
|
+
console.log(`Skipped ${skippedCount} eval case(s)`);
|
|
41217
|
+
}
|
|
41218
|
+
} else {
|
|
41219
|
+
console.log("\nNo eval cases updated (all already have rubrics or missing expected_outcome)");
|
|
41220
|
+
}
|
|
41221
|
+
}
|
|
41222
|
+
function extractQuestion(evalCase) {
|
|
41223
|
+
const explicitQuestion = asString6(evalCase.question);
|
|
41224
|
+
if (explicitQuestion) {
|
|
41225
|
+
return explicitQuestion;
|
|
41226
|
+
}
|
|
41227
|
+
const inputMessages = evalCase.input_messages;
|
|
41228
|
+
if (!Array.isArray(inputMessages)) {
|
|
41229
|
+
return void 0;
|
|
41230
|
+
}
|
|
41231
|
+
for (const msg of inputMessages) {
|
|
41232
|
+
if (!isJsonObject3(msg)) {
|
|
41233
|
+
continue;
|
|
41234
|
+
}
|
|
41235
|
+
if (msg.role === "user" && typeof msg.content === "string") {
|
|
41236
|
+
return msg.content;
|
|
41237
|
+
}
|
|
41238
|
+
}
|
|
41239
|
+
return void 0;
|
|
41240
|
+
}
|
|
41241
|
+
|
|
41242
|
+
// src/commands/generate/index.ts
|
|
41243
|
+
var rubricsCommand = command2({
|
|
41244
|
+
name: "rubrics",
|
|
41245
|
+
description: "Generate rubrics from expected_outcome in YAML eval file",
|
|
41246
|
+
args: {
|
|
41247
|
+
file: positional2({
|
|
41248
|
+
type: string5,
|
|
41249
|
+
displayName: "file",
|
|
41250
|
+
description: "Path to YAML eval file"
|
|
41251
|
+
}),
|
|
41252
|
+
target: option2({
|
|
41253
|
+
type: optional3(string5),
|
|
41254
|
+
long: "target",
|
|
41255
|
+
short: "t",
|
|
41256
|
+
description: "Override target for rubric generation (default: file target or openai:gpt-4o)"
|
|
41257
|
+
}),
|
|
41258
|
+
verbose: flag2({
|
|
41259
|
+
long: "verbose",
|
|
41260
|
+
short: "v",
|
|
41261
|
+
description: "Show detailed progress"
|
|
41262
|
+
})
|
|
41263
|
+
},
|
|
41264
|
+
handler: async ({ file: file2, target, verbose }) => {
|
|
41265
|
+
try {
|
|
41266
|
+
await generateRubricsCommand({
|
|
41267
|
+
file: file2,
|
|
41268
|
+
target,
|
|
41269
|
+
verbose
|
|
41270
|
+
});
|
|
41271
|
+
} catch (error40) {
|
|
41272
|
+
console.error(`Error: ${error40.message}`);
|
|
41273
|
+
process.exit(1);
|
|
41274
|
+
}
|
|
41275
|
+
}
|
|
41276
|
+
});
|
|
41277
|
+
var generateCommand = subcommands({
|
|
41278
|
+
name: "generate",
|
|
41279
|
+
description: "Generate evaluation artifacts",
|
|
41280
|
+
cmds: {
|
|
41281
|
+
rubrics: rubricsCommand
|
|
41282
|
+
}
|
|
41283
|
+
});
|
|
41284
|
+
|
|
40708
41285
|
// src/commands/init/index.ts
|
|
40709
41286
|
import { existsSync, mkdirSync, writeFileSync } from "node:fs";
|
|
40710
|
-
import
|
|
41287
|
+
import path24 from "node:path";
|
|
40711
41288
|
import * as readline from "node:readline/promises";
|
|
41289
|
+
import { command as command3, option as option3, optional as optional4, string as string6 } from "cmd-ts";
|
|
40712
41290
|
|
|
40713
41291
|
// src/templates/index.ts
|
|
40714
41292
|
import { readFileSync, readdirSync, statSync } from "node:fs";
|
|
40715
|
-
import
|
|
41293
|
+
import path21 from "node:path";
|
|
40716
41294
|
import { fileURLToPath } from "node:url";
|
|
40717
41295
|
function getGithubTemplates() {
|
|
40718
41296
|
return getTemplatesFromDir(".github");
|
|
@@ -40724,12 +41302,12 @@ function getClaudeTemplates() {
|
|
|
40724
41302
|
return getTemplatesFromDir(".claude");
|
|
40725
41303
|
}
|
|
40726
41304
|
function getTemplatesFromDir(subdir) {
|
|
40727
|
-
const currentDir =
|
|
41305
|
+
const currentDir = path21.dirname(fileURLToPath(import.meta.url));
|
|
40728
41306
|
let templatesDir;
|
|
40729
|
-
if (currentDir.includes(`${
|
|
40730
|
-
templatesDir =
|
|
41307
|
+
if (currentDir.includes(`${path21.sep}dist`)) {
|
|
41308
|
+
templatesDir = path21.join(currentDir, "templates", subdir);
|
|
40731
41309
|
} else {
|
|
40732
|
-
templatesDir =
|
|
41310
|
+
templatesDir = path21.join(currentDir, subdir);
|
|
40733
41311
|
}
|
|
40734
41312
|
return readTemplatesRecursively(templatesDir, "");
|
|
40735
41313
|
}
|
|
@@ -40737,15 +41315,15 @@ function readTemplatesRecursively(dir, relativePath) {
|
|
|
40737
41315
|
const templates = [];
|
|
40738
41316
|
const entries = readdirSync(dir);
|
|
40739
41317
|
for (const entry of entries) {
|
|
40740
|
-
const fullPath =
|
|
41318
|
+
const fullPath = path21.join(dir, entry);
|
|
40741
41319
|
const stat6 = statSync(fullPath);
|
|
40742
|
-
const entryRelativePath = relativePath ?
|
|
41320
|
+
const entryRelativePath = relativePath ? path21.join(relativePath, entry) : entry;
|
|
40743
41321
|
if (stat6.isDirectory()) {
|
|
40744
41322
|
templates.push(...readTemplatesRecursively(fullPath, entryRelativePath));
|
|
40745
41323
|
} else {
|
|
40746
41324
|
const content = readFileSync(fullPath, "utf-8");
|
|
40747
41325
|
templates.push({
|
|
40748
|
-
path: entryRelativePath.split(
|
|
41326
|
+
path: entryRelativePath.split(path21.sep).join("/"),
|
|
40749
41327
|
// Normalize to forward slashes
|
|
40750
41328
|
content
|
|
40751
41329
|
});
|
|
@@ -40768,10 +41346,10 @@ async function promptYesNo(message) {
|
|
|
40768
41346
|
}
|
|
40769
41347
|
}
|
|
40770
41348
|
async function initCommand(options = {}) {
|
|
40771
|
-
const targetPath =
|
|
40772
|
-
const githubDir =
|
|
40773
|
-
const agentvDir =
|
|
40774
|
-
const claudeDir =
|
|
41349
|
+
const targetPath = path24.resolve(options.targetPath ?? ".");
|
|
41350
|
+
const githubDir = path24.join(targetPath, ".github");
|
|
41351
|
+
const agentvDir = path24.join(targetPath, ".agentv");
|
|
41352
|
+
const claudeDir = path24.join(targetPath, ".claude");
|
|
40775
41353
|
const githubTemplates = getGithubTemplates();
|
|
40776
41354
|
const agentvTemplates = getAgentvTemplates();
|
|
40777
41355
|
const claudeTemplates = getClaudeTemplates();
|
|
@@ -40779,32 +41357,32 @@ async function initCommand(options = {}) {
|
|
|
40779
41357
|
const otherAgentvTemplates = agentvTemplates.filter((t) => t.path !== ".env.template");
|
|
40780
41358
|
const existingFiles = [];
|
|
40781
41359
|
if (envTemplate) {
|
|
40782
|
-
const envFilePath =
|
|
41360
|
+
const envFilePath = path24.join(targetPath, ".env.template");
|
|
40783
41361
|
if (existsSync(envFilePath)) {
|
|
40784
41362
|
existingFiles.push(".env.template");
|
|
40785
41363
|
}
|
|
40786
41364
|
}
|
|
40787
41365
|
if (existsSync(githubDir)) {
|
|
40788
41366
|
for (const template of githubTemplates) {
|
|
40789
|
-
const targetFilePath =
|
|
41367
|
+
const targetFilePath = path24.join(githubDir, template.path);
|
|
40790
41368
|
if (existsSync(targetFilePath)) {
|
|
40791
|
-
existingFiles.push(
|
|
41369
|
+
existingFiles.push(path24.relative(targetPath, targetFilePath));
|
|
40792
41370
|
}
|
|
40793
41371
|
}
|
|
40794
41372
|
}
|
|
40795
41373
|
if (existsSync(agentvDir)) {
|
|
40796
41374
|
for (const template of otherAgentvTemplates) {
|
|
40797
|
-
const targetFilePath =
|
|
41375
|
+
const targetFilePath = path24.join(agentvDir, template.path);
|
|
40798
41376
|
if (existsSync(targetFilePath)) {
|
|
40799
|
-
existingFiles.push(
|
|
41377
|
+
existingFiles.push(path24.relative(targetPath, targetFilePath));
|
|
40800
41378
|
}
|
|
40801
41379
|
}
|
|
40802
41380
|
}
|
|
40803
41381
|
if (existsSync(claudeDir)) {
|
|
40804
41382
|
for (const template of claudeTemplates) {
|
|
40805
|
-
const targetFilePath =
|
|
41383
|
+
const targetFilePath = path24.join(claudeDir, template.path);
|
|
40806
41384
|
if (existsSync(targetFilePath)) {
|
|
40807
|
-
existingFiles.push(
|
|
41385
|
+
existingFiles.push(path24.relative(targetPath, targetFilePath));
|
|
40808
41386
|
}
|
|
40809
41387
|
}
|
|
40810
41388
|
}
|
|
@@ -40831,36 +41409,36 @@ async function initCommand(options = {}) {
|
|
|
40831
41409
|
mkdirSync(claudeDir, { recursive: true });
|
|
40832
41410
|
}
|
|
40833
41411
|
if (envTemplate) {
|
|
40834
|
-
const envFilePath =
|
|
41412
|
+
const envFilePath = path24.join(targetPath, ".env.template");
|
|
40835
41413
|
writeFileSync(envFilePath, envTemplate.content, "utf-8");
|
|
40836
41414
|
console.log("Created .env.template");
|
|
40837
41415
|
}
|
|
40838
41416
|
for (const template of githubTemplates) {
|
|
40839
|
-
const targetFilePath =
|
|
40840
|
-
const targetDirPath =
|
|
41417
|
+
const targetFilePath = path24.join(githubDir, template.path);
|
|
41418
|
+
const targetDirPath = path24.dirname(targetFilePath);
|
|
40841
41419
|
if (!existsSync(targetDirPath)) {
|
|
40842
41420
|
mkdirSync(targetDirPath, { recursive: true });
|
|
40843
41421
|
}
|
|
40844
41422
|
writeFileSync(targetFilePath, template.content, "utf-8");
|
|
40845
|
-
console.log(`Created ${
|
|
41423
|
+
console.log(`Created ${path24.relative(targetPath, targetFilePath)}`);
|
|
40846
41424
|
}
|
|
40847
41425
|
for (const template of otherAgentvTemplates) {
|
|
40848
|
-
const targetFilePath =
|
|
40849
|
-
const targetDirPath =
|
|
41426
|
+
const targetFilePath = path24.join(agentvDir, template.path);
|
|
41427
|
+
const targetDirPath = path24.dirname(targetFilePath);
|
|
40850
41428
|
if (!existsSync(targetDirPath)) {
|
|
40851
41429
|
mkdirSync(targetDirPath, { recursive: true });
|
|
40852
41430
|
}
|
|
40853
41431
|
writeFileSync(targetFilePath, template.content, "utf-8");
|
|
40854
|
-
console.log(`Created ${
|
|
41432
|
+
console.log(`Created ${path24.relative(targetPath, targetFilePath)}`);
|
|
40855
41433
|
}
|
|
40856
41434
|
for (const template of claudeTemplates) {
|
|
40857
|
-
const targetFilePath =
|
|
40858
|
-
const targetDirPath =
|
|
41435
|
+
const targetFilePath = path24.join(claudeDir, template.path);
|
|
41436
|
+
const targetDirPath = path24.dirname(targetFilePath);
|
|
40859
41437
|
if (!existsSync(targetDirPath)) {
|
|
40860
41438
|
mkdirSync(targetDirPath, { recursive: true });
|
|
40861
41439
|
}
|
|
40862
41440
|
writeFileSync(targetFilePath, template.content, "utf-8");
|
|
40863
|
-
console.log(`Created ${
|
|
41441
|
+
console.log(`Created ${path24.relative(targetPath, targetFilePath)}`);
|
|
40864
41442
|
}
|
|
40865
41443
|
console.log("\nAgentV initialized successfully!");
|
|
40866
41444
|
console.log("\nFiles installed to root:");
|
|
@@ -40868,17 +41446,17 @@ async function initCommand(options = {}) {
|
|
|
40868
41446
|
console.log(" - .env.template");
|
|
40869
41447
|
}
|
|
40870
41448
|
console.log(`
|
|
40871
|
-
Files installed to ${
|
|
41449
|
+
Files installed to ${path24.relative(targetPath, githubDir)}:`);
|
|
40872
41450
|
for (const t of githubTemplates) {
|
|
40873
41451
|
console.log(` - ${t.path}`);
|
|
40874
41452
|
}
|
|
40875
41453
|
console.log(`
|
|
40876
|
-
Files installed to ${
|
|
41454
|
+
Files installed to ${path24.relative(targetPath, agentvDir)}:`);
|
|
40877
41455
|
for (const t of otherAgentvTemplates) {
|
|
40878
41456
|
console.log(` - ${t.path}`);
|
|
40879
41457
|
}
|
|
40880
41458
|
console.log(`
|
|
40881
|
-
Files installed to ${
|
|
41459
|
+
Files installed to ${path24.relative(targetPath, claudeDir)}:`);
|
|
40882
41460
|
for (const t of claudeTemplates) {
|
|
40883
41461
|
console.log(` - ${t.path}`);
|
|
40884
41462
|
}
|
|
@@ -40887,15 +41465,28 @@ Files installed to ${path21.relative(targetPath, claudeDir)}:`);
|
|
|
40887
41465
|
console.log(" 2. Configure targets in .agentv/targets.yaml");
|
|
40888
41466
|
console.log(" 3. Create eval files using the schema and prompt templates");
|
|
40889
41467
|
}
|
|
41468
|
+
var initCmdTsCommand = command3({
|
|
41469
|
+
name: "init",
|
|
41470
|
+
description: "Initialize AgentV in your project (installs prompt templates and schema to .github)",
|
|
41471
|
+
args: {
|
|
41472
|
+
path: option3({
|
|
41473
|
+
type: optional4(string6),
|
|
41474
|
+
long: "path",
|
|
41475
|
+
description: "Target directory for initialization (default: current directory)"
|
|
41476
|
+
})
|
|
41477
|
+
},
|
|
41478
|
+
handler: async ({ path: targetPath }) => {
|
|
41479
|
+
try {
|
|
41480
|
+
await initCommand({ targetPath });
|
|
41481
|
+
} catch (error40) {
|
|
41482
|
+
console.error(`Error: ${error40.message}`);
|
|
41483
|
+
process.exit(1);
|
|
41484
|
+
}
|
|
41485
|
+
}
|
|
41486
|
+
});
|
|
40890
41487
|
|
|
40891
|
-
// src/commands/
|
|
40892
|
-
|
|
40893
|
-
program.command("status").description("Show the latest AgentV kernel status").action(() => {
|
|
40894
|
-
const kernel = createAgentKernel();
|
|
40895
|
-
console.log(`Kernel status: ${kernel.status}`);
|
|
40896
|
-
});
|
|
40897
|
-
return program;
|
|
40898
|
-
}
|
|
41488
|
+
// src/commands/validate/index.ts
|
|
41489
|
+
import { command as command4, restPositionals as restPositionals2, string as string7 } from "cmd-ts";
|
|
40899
41490
|
|
|
40900
41491
|
// src/commands/validate/format-output.ts
|
|
40901
41492
|
var ANSI_RED3 = "\x1B[31m";
|
|
@@ -40980,7 +41571,7 @@ function isTTY2() {
|
|
|
40980
41571
|
// src/commands/validate/validate-files.ts
|
|
40981
41572
|
import { constants as constants7 } from "node:fs";
|
|
40982
41573
|
import { access as access7, readdir as readdir3, stat as stat5 } from "node:fs/promises";
|
|
40983
|
-
import
|
|
41574
|
+
import path25 from "node:path";
|
|
40984
41575
|
async function validateFiles(paths) {
|
|
40985
41576
|
const filePaths = await expandPaths(paths);
|
|
40986
41577
|
const results = [];
|
|
@@ -40998,22 +41589,8 @@ async function validateFiles(paths) {
|
|
|
40998
41589
|
};
|
|
40999
41590
|
}
|
|
41000
41591
|
async function validateSingleFile(filePath) {
|
|
41001
|
-
const absolutePath =
|
|
41592
|
+
const absolutePath = path25.resolve(filePath);
|
|
41002
41593
|
const fileType = await detectFileType(absolutePath);
|
|
41003
|
-
if (fileType === "unknown") {
|
|
41004
|
-
return {
|
|
41005
|
-
valid: false,
|
|
41006
|
-
filePath: absolutePath,
|
|
41007
|
-
fileType: "unknown",
|
|
41008
|
-
errors: [
|
|
41009
|
-
{
|
|
41010
|
-
severity: "error",
|
|
41011
|
-
filePath: absolutePath,
|
|
41012
|
-
message: "Missing or invalid $schema field. File must declare schema: 'agentv-eval-v2', 'agentv-targets-v2', or 'agentv-config-v2'"
|
|
41013
|
-
}
|
|
41014
|
-
]
|
|
41015
|
-
};
|
|
41016
|
-
}
|
|
41017
41594
|
let result;
|
|
41018
41595
|
if (fileType === "eval") {
|
|
41019
41596
|
result = await validateEvalFile(absolutePath);
|
|
@@ -41037,7 +41614,7 @@ async function validateSingleFile(filePath) {
|
|
|
41037
41614
|
async function expandPaths(paths) {
|
|
41038
41615
|
const expanded = [];
|
|
41039
41616
|
for (const inputPath of paths) {
|
|
41040
|
-
const absolutePath =
|
|
41617
|
+
const absolutePath = path25.resolve(inputPath);
|
|
41041
41618
|
try {
|
|
41042
41619
|
await access7(absolutePath, constants7.F_OK);
|
|
41043
41620
|
} catch {
|
|
@@ -41061,7 +41638,7 @@ async function findYamlFiles(dirPath) {
|
|
|
41061
41638
|
try {
|
|
41062
41639
|
const entries = await readdir3(dirPath, { withFileTypes: true });
|
|
41063
41640
|
for (const entry of entries) {
|
|
41064
|
-
const fullPath =
|
|
41641
|
+
const fullPath = path25.join(dirPath, entry.name);
|
|
41065
41642
|
if (entry.isDirectory()) {
|
|
41066
41643
|
if (entry.name === "node_modules" || entry.name.startsWith(".")) {
|
|
41067
41644
|
continue;
|
|
@@ -41078,12 +41655,12 @@ async function findYamlFiles(dirPath) {
|
|
|
41078
41655
|
return results;
|
|
41079
41656
|
}
|
|
41080
41657
|
function isYamlFile(filePath) {
|
|
41081
|
-
const ext =
|
|
41658
|
+
const ext = path25.extname(filePath).toLowerCase();
|
|
41082
41659
|
return ext === ".yaml" || ext === ".yml";
|
|
41083
41660
|
}
|
|
41084
41661
|
|
|
41085
41662
|
// src/commands/validate/index.ts
|
|
41086
|
-
async function runValidateCommand(paths
|
|
41663
|
+
async function runValidateCommand(paths) {
|
|
41087
41664
|
if (paths.length === 0) {
|
|
41088
41665
|
console.error("Error: No paths specified. Usage: agentv validate <paths...>");
|
|
41089
41666
|
process.exit(1);
|
|
@@ -41095,46 +41672,45 @@ async function runValidateCommand(paths, _options) {
|
|
|
41095
41672
|
process.exit(1);
|
|
41096
41673
|
}
|
|
41097
41674
|
}
|
|
41098
|
-
|
|
41099
|
-
|
|
41675
|
+
var validateCommand = command4({
|
|
41676
|
+
name: "validate",
|
|
41677
|
+
description: "Validate AgentV eval and targets YAML files",
|
|
41678
|
+
args: {
|
|
41679
|
+
paths: restPositionals2({
|
|
41680
|
+
type: string7,
|
|
41681
|
+
displayName: "paths",
|
|
41682
|
+
description: "Files or directories to validate"
|
|
41683
|
+
})
|
|
41684
|
+
},
|
|
41685
|
+
handler: async ({ paths }) => {
|
|
41100
41686
|
try {
|
|
41101
|
-
await runValidateCommand(paths
|
|
41687
|
+
await runValidateCommand(paths);
|
|
41102
41688
|
} catch (error40) {
|
|
41103
41689
|
console.error(`Error: ${error40.message}`);
|
|
41104
41690
|
process.exit(1);
|
|
41105
41691
|
}
|
|
41106
|
-
}
|
|
41107
|
-
|
|
41108
|
-
}
|
|
41692
|
+
}
|
|
41693
|
+
});
|
|
41109
41694
|
|
|
41110
41695
|
// src/index.ts
|
|
41111
41696
|
var packageJson = JSON.parse(readFileSync2(new URL("../package.json", import.meta.url), "utf8"));
|
|
41112
|
-
|
|
41113
|
-
|
|
41114
|
-
|
|
41115
|
-
|
|
41116
|
-
|
|
41117
|
-
|
|
41118
|
-
|
|
41119
|
-
|
|
41120
|
-
|
|
41121
|
-
|
|
41122
|
-
|
|
41123
|
-
} catch (error40) {
|
|
41124
|
-
console.error(`Error: ${error40.message}`);
|
|
41125
|
-
process.exit(1);
|
|
41126
|
-
}
|
|
41127
|
-
});
|
|
41128
|
-
return program;
|
|
41129
|
-
}
|
|
41697
|
+
var app = subcommands2({
|
|
41698
|
+
name: "agentv",
|
|
41699
|
+
description: "AgentV CLI",
|
|
41700
|
+
version: packageJson.version,
|
|
41701
|
+
cmds: {
|
|
41702
|
+
eval: evalCommand,
|
|
41703
|
+
validate: validateCommand,
|
|
41704
|
+
generate: generateCommand,
|
|
41705
|
+
init: initCmdTsCommand
|
|
41706
|
+
}
|
|
41707
|
+
});
|
|
41130
41708
|
async function runCli(argv = process.argv) {
|
|
41131
|
-
|
|
41132
|
-
await program.parseAsync(argv);
|
|
41133
|
-
return program;
|
|
41709
|
+
await run(binary(app), argv);
|
|
41134
41710
|
}
|
|
41135
41711
|
|
|
41136
41712
|
export {
|
|
41137
|
-
|
|
41713
|
+
app,
|
|
41138
41714
|
runCli
|
|
41139
41715
|
};
|
|
41140
|
-
//# sourceMappingURL=chunk-
|
|
41716
|
+
//# sourceMappingURL=chunk-WOCXZEH4.js.map
|