agentv 0.21.0 → 0.21.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-MA3MJNJH.js → chunk-A5T7W63L.js} +481 -420
- package/dist/chunk-A5T7W63L.js.map +1 -0
- package/dist/cli.js +5 -2
- package/dist/cli.js.map +1 -1
- package/dist/index.js +3 -3
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +3 -3
- package/package.json +8 -5
- package/dist/chunk-MA3MJNJH.js.map +0 -1
@@ -142,11 +142,20 @@ var require_dist = __commonJS({

 // src/index.ts
 import { readFileSync as readFileSync2 } from "node:fs";
-import {
+import { binary, run, subcommands as subcommands2 } from "cmd-ts";

 // src/commands/eval/index.ts
 import { stat as stat4 } from "node:fs/promises";
 import path19 from "node:path";
+import {
+command,
+flag,
+number as number4,
+option,
+optional as optional2,
+restPositionals,
+string as string4
+} from "cmd-ts";
 import fg from "fast-glob";

 // src/commands/eval/run-eval.ts
@@ -155,7 +164,7 @@ import { access as access6, mkdir as mkdir7 } from "node:fs/promises";
 import path18 from "node:path";
 import { pathToFileURL } from "node:url";

-// ../../packages/core/dist/chunk-
+// ../../packages/core/dist/chunk-B2J23S7D.js
 import { constants } from "node:fs";
 import { access, readFile } from "node:fs/promises";
 import path from "node:path";
@@ -1039,8 +1048,8 @@ var ZodType = class {
 promise() {
 return ZodPromise.create(this, this._def);
 }
-or(
-return ZodUnion.create([this,
+or(option4) {
+return ZodUnion.create([this, option4], this._def);
 }
 and(incoming) {
 return ZodIntersection.create(this, incoming, this._def);
@@ -2890,7 +2899,7 @@ var ZodUnion = class extends ZodType {
 return INVALID;
 }
 if (ctx.common.async) {
-return Promise.all(options.map(async (
+return Promise.all(options.map(async (option4) => {
 const childCtx = {
 ...ctx,
 common: {
@@ -2900,7 +2909,7 @@ var ZodUnion = class extends ZodType {
 parent: null
 };
 return {
-result: await
+result: await option4._parseAsync({
 data: ctx.data,
 path: ctx.path,
 parent: childCtx
@@ -2911,7 +2920,7 @@ var ZodUnion = class extends ZodType {
 } else {
 let dirty = void 0;
 const issues = [];
-for (const
+for (const option4 of options) {
 const childCtx = {
 ...ctx,
 common: {
@@ -2920,7 +2929,7 @@ var ZodUnion = class extends ZodType {
 },
 parent: null
 };
-const result =
+const result = option4._parseSync({
 data: ctx.data,
 path: ctx.path,
 parent: childCtx
@@ -3001,8 +3010,8 @@ var ZodDiscriminatedUnion = class _ZodDiscriminatedUnion extends ZodType {
 }
 const discriminator = this.discriminator;
 const discriminatorValue = ctx.data[discriminator];
-const
-if (!
+const option4 = this.optionsMap.get(discriminatorValue);
+if (!option4) {
 addIssueToContext(ctx, {
 code: ZodIssueCode.invalid_union_discriminator,
 options: Array.from(this.optionsMap.keys()),
@@ -3011,13 +3020,13 @@ var ZodDiscriminatedUnion = class _ZodDiscriminatedUnion extends ZodType {
 return INVALID;
 }
 if (ctx.common.async) {
-return
+return option4._parseAsync({
 data: ctx.data,
 path: ctx.path,
 parent: ctx
 });
 } else {
-return
+return option4._parseSync({
 data: ctx.data,
 path: ctx.path,
 parent: ctx
@@ -4201,7 +4210,7 @@ var coerce = {
 };
 var NEVER = INVALID;

-// ../../packages/core/dist/chunk-
+// ../../packages/core/dist/chunk-B2J23S7D.js
 async function fileExists(filePath) {
 try {
 await access(filePath, constants.F_OK);
@@ -4577,9 +4586,9 @@ function resolveVSCodeConfig(target, env, insiders) {
 const dryRunSource = target.dry_run ?? target.dryRun;
 const subagentRootSource = target.subagent_root ?? target.subagentRoot;
 const defaultCommand = insiders ? "code-insiders" : "code";
-const
+const command5 = resolveOptionalLiteralString(commandSource) ?? defaultCommand;
 return {
-command,
+command: command5,
 waitForResponse: resolveOptionalBoolean(waitSource) ?? true,
 dryRun: resolveOptionalBoolean(dryRunSource) ?? false,
 subagentRoot: resolveOptionalString(subagentRootSource, env, `${target.name} subagent root`, {
@@ -8081,7 +8090,7 @@ var $ZodUnion = /* @__PURE__ */ $constructor("$ZodUnion", (inst, def) => {
 defineLazy(inst._zod, "optout", () => def.options.some((o) => o._zod.optout === "optional") ? "optional" : void 0);
 defineLazy(inst._zod, "values", () => {
 if (def.options.every((o) => o._zod.values)) {
-return new Set(def.options.flatMap((
+return new Set(def.options.flatMap((option4) => Array.from(option4._zod.values)));
 }
 return void 0;
 });
@@ -8095,8 +8104,8 @@ var $ZodUnion = /* @__PURE__ */ $constructor("$ZodUnion", (inst, def) => {
 inst._zod.parse = (payload, ctx) => {
 let async = false;
 const results = [];
-for (const
-const result =
+for (const option4 of def.options) {
+const result = option4._zod.run({
 value: payload.value,
 issues: []
 }, ctx);
@@ -8121,10 +8130,10 @@ var $ZodDiscriminatedUnion = /* @__PURE__ */ $constructor("$ZodDiscriminatedUnio
 const _super = inst._zod.parse;
 defineLazy(inst._zod, "propValues", () => {
 const propValues = {};
-for (const
-const pv =
+for (const option4 of def.options) {
+const pv = option4._zod.propValues;
 if (!pv || Object.keys(pv).length === 0)
-throw new Error(`Invalid discriminated union option at index "${def.options.indexOf(
+throw new Error(`Invalid discriminated union option at index "${def.options.indexOf(option4)}"`);
 for (const [k, v] of Object.entries(pv)) {
 if (!propValues[k])
 propValues[k] = /* @__PURE__ */ new Set();
@@ -15328,8 +15337,8 @@ function isTransforming(_schema, _ctx) {
 return false;
 }
 case "union": {
-for (const
-if (isTransforming(
+for (const option4 of def.options) {
+if (isTransforming(option4, ctx))
 return true;
 }
 return false;
@@ -34920,25 +34929,25 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
 }
 }
 const _model = asString2(rawEvaluator.model);
+const rawRubrics = rawEvaluator.rubrics;
+const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
+id: asString2(rubric.id) ?? `rubric-${index + 1}`,
+description: asString2(rubric.description) ?? "",
+weight: typeof rubric.weight === "number" ? rubric.weight : 1,
+required: typeof rubric.required === "boolean" ? rubric.required : true
+})).filter((r) => r.description.length > 0) : void 0;
 if (typeValue === "rubric") {
-
-if (!Array.isArray(rubrics)) {
+if (!parsedRubrics) {
 logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': missing rubrics array`);
 continue;
 }
-const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
-id: asString2(rubric.id) ?? `rubric-${index + 1}`,
-description: asString2(rubric.description) ?? "",
-weight: typeof rubric.weight === "number" ? rubric.weight : 1,
-required: typeof rubric.required === "boolean" ? rubric.required : true
-})).filter((r) => r.description.length > 0);
 if (parsedRubrics.length === 0) {
 logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': no valid rubrics found`);
 continue;
 }
 evaluators.push({
 name: name16,
-type: "
+type: "llm_judge",
 rubrics: parsedRubrics
 });
 continue;
@@ -34947,7 +34956,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
 name: name16,
 type: "llm_judge",
 prompt,
-promptPath
+promptPath,
+...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
 });
 }
 return evaluators.length > 0 ? evaluators : void 0;
@@ -35488,7 +35498,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
 if (rubricItems.length > 0) {
 const rubricEvaluator = {
 name: "rubric",
-type: "
+type: "llm_judge",
 rubrics: rubricItems
 };
 evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
@@ -35887,7 +35897,7 @@ async function withRetry(fn, retryConfig, signal) {
 }
 var execAsync2 = promisify2(execWithCallback);
 var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
-async function defaultCommandRunner(
+async function defaultCommandRunner(command5, options) {
 const execOptions = {
 cwd: options.cwd,
 env: options.env,
@@ -35897,7 +35907,7 @@ async function defaultCommandRunner(command, options) {
 shell: process.platform === "win32" ? "powershell.exe" : void 0
 };
 try {
-const { stdout, stderr } = await execAsync2(
+const { stdout, stderr } = await execAsync2(command5, execOptions);
 return {
 stdout,
 stderr,
@@ -37321,144 +37331,6 @@ function createProvider(target) {
 }
 }
 }
-var rubricCheckResultSchema = external_exports.object({
-id: external_exports.string().describe("The ID of the rubric item being checked"),
-satisfied: external_exports.boolean().describe("Whether this rubric requirement is met"),
-reasoning: external_exports.string().describe("Brief explanation (1-2 sentences) for this check")
-});
-var rubricEvaluationSchema = external_exports.object({
-checks: external_exports.array(rubricCheckResultSchema).describe("Results for each rubric item"),
-overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)")
-});
-var RubricEvaluator = class {
-kind = "rubric";
-config;
-resolveJudgeProvider;
-constructor(options) {
-this.config = options.config;
-this.resolveJudgeProvider = options.resolveJudgeProvider;
-}
-async evaluate(context) {
-const judgeProvider = await this.resolveJudgeProvider(context);
-if (!judgeProvider) {
-throw new Error("No judge provider available for rubric evaluation");
-}
-if (!this.config.rubrics || this.config.rubrics.length === 0) {
-throw new Error(
-`No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
-);
-}
-const prompt = this.buildPrompt(context, this.config.rubrics);
-const model = judgeProvider.asLanguageModel?.();
-if (!model) {
-throw new Error("Judge provider does not support language model interface");
-}
-const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
-You must return a valid JSON object matching this schema:
-{
-"checks": [
-{
-"id": "string (rubric id)",
-"satisfied": boolean,
-"reasoning": "string (brief explanation)"
-}
-],
-"overall_reasoning": "string (summary)"
-}`;
-let result;
-let lastError;
-for (let attempt = 1; attempt <= 3; attempt++) {
-try {
-const { text: text2 } = await generateText({
-model,
-system,
-prompt
-});
-const cleaned = text2.replace(/```json\n?|```/g, "").trim();
-result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
-break;
-} catch (e) {
-lastError = e instanceof Error ? e : new Error(String(e));
-}
-}
-if (!result) {
-throw new Error(
-`Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
-);
-}
-const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
-return {
-score,
-verdict,
-hits,
-misses,
-expectedAspectCount: this.config.rubrics.length,
-reasoning: result.overall_reasoning,
-evaluatorRawRequest: {
-prompt
-}
-};
-}
-buildPrompt(context, rubrics) {
-const parts = [
-"You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
-"",
-"[[ ## question ## ]]",
-context.evalCase.question,
-"",
-"[[ ## expected_outcome ## ]]",
-context.evalCase.expected_outcome,
-""
-];
-if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
-parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
-}
-parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
-for (const rubric of rubrics) {
-const requiredLabel = rubric.required ? " (REQUIRED)" : "";
-const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
-parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
-}
-parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
-return parts.join("\n");
-}
-calculateScore(result, rubrics) {
-const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
-const hits = [];
-const misses = [];
-let totalWeight = 0;
-let earnedWeight = 0;
-let failedRequired = false;
-for (const check2 of result.checks) {
-const rubric = rubricMap.get(check2.id);
-if (!rubric) {
-continue;
-}
-totalWeight += rubric.weight;
-if (check2.satisfied) {
-earnedWeight += rubric.weight;
-hits.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
-} else {
-misses.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
-if (rubric.required) {
-failedRequired = true;
-}
-}
-}
-const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
-let verdict;
-if (failedRequired) {
-verdict = "fail";
-} else if (score >= 0.8) {
-verdict = "pass";
-} else if (score >= 0.6) {
-verdict = "borderline";
-} else {
-verdict = "fail";
-}
-return { score, verdict, hits, misses };
-}
-};
 var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.

 Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -37476,6 +37348,21 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r

 [[ ## candidate_answer ## ]]
 {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
+var freeformEvaluationSchema = external_exports.object({
+score: external_exports.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
+hits: external_exports.array(external_exports.string()).describe("Brief specific achievements").optional(),
+misses: external_exports.array(external_exports.string()).describe("Brief failures or omissions").optional(),
+reasoning: external_exports.string().describe("Concise explanation (1-2 sentences)").optional()
+});
+var rubricCheckResultSchema = external_exports.object({
+id: external_exports.string().describe("The ID of the rubric item being checked"),
+satisfied: external_exports.boolean().describe("Whether this rubric requirement is met"),
+reasoning: external_exports.string().describe("Brief explanation (1-2 sentences) for this check")
+});
+var rubricEvaluationSchema = external_exports.object({
+checks: external_exports.array(rubricCheckResultSchema).describe("Results for each rubric item"),
+overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)")
+});
 var LlmJudgeEvaluator = class {
 kind = "llm_judge";
 resolveJudgeProvider;
@@ -37493,9 +37380,13 @@ var LlmJudgeEvaluator = class {
 if (!judgeProvider) {
 throw new Error("No judge provider available for LLM grading");
 }
-
+const config2 = context.evaluator;
+if (config2?.type === "llm_judge" && config2.rubrics && config2.rubrics.length > 0) {
+return this.evaluateWithRubrics(context, judgeProvider, config2.rubrics);
+}
+return this.evaluateFreeform(context, judgeProvider);
 }
-async
+async evaluateFreeform(context, judgeProvider) {
 const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
 const variables = {
 [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
@@ -37512,34 +37403,132 @@ var LlmJudgeEvaluator = class {
 const systemPrompt = buildOutputSchema();
 const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
 const userPrompt = substituteVariables(evaluatorTemplate, variables);
-const response = await judgeProvider.invoke({
-question: userPrompt,
-systemPrompt,
-evalCaseId: context.evalCase.id,
-attempt: context.attempt,
-maxOutputTokens: this.maxOutputTokens,
-temperature: this.temperature
-});
-const parsed = parseQualityResponse(response);
-const score = clampScore(parsed.score ?? 0);
-const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
-const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
-const reasoning = parsed.reasoning ?? response.reasoning;
-const expectedAspectCount = Math.max(hits.length + misses.length, 1);
 const evaluatorRawRequest = {
 userPrompt,
 systemPrompt,
 target: judgeProvider.targetName
 };
+try {
+const { data, providerResponse } = await this.runWithRetry({
+context,
+judgeProvider,
+systemPrompt,
+userPrompt,
+schema: freeformEvaluationSchema
+});
+const score = clampScore(data.score);
+const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
+const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
+const reasoning = data.reasoning ?? providerResponse?.reasoning;
+const expectedAspectCount = Math.max(hits.length + misses.length, 1);
+return {
+score,
+verdict: scoreToVerdict(score),
+hits,
+misses,
+expectedAspectCount,
+reasoning,
+evaluatorRawRequest
+};
+} catch {
+return {
+score: 0,
+verdict: "fail",
+hits: [],
+misses: [],
+expectedAspectCount: 1,
+evaluatorRawRequest
+};
+}
+}
+async evaluateWithRubrics(context, judgeProvider, rubrics) {
+if (!rubrics || rubrics.length === 0) {
+throw new Error(
+`No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
+);
+}
+const prompt = this.buildRubricPrompt(context, rubrics);
+const systemPrompt = buildRubricOutputSchema();
+const evaluatorRawRequest = {
+userPrompt: prompt,
+systemPrompt,
+target: judgeProvider.targetName
+};
+const { data } = await this.runWithRetry({
+context,
+judgeProvider,
+systemPrompt,
+userPrompt: prompt,
+schema: rubricEvaluationSchema
+});
+const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
 return {
 score,
+verdict,
 hits,
 misses,
-expectedAspectCount,
-reasoning,
+expectedAspectCount: rubrics.length,
+reasoning: data.overall_reasoning,
 evaluatorRawRequest
 };
 }
+buildRubricPrompt(context, rubrics) {
+const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
+const parts = [
+"You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
+"",
+"[[ ## question ## ]]",
+formattedQuestion,
+"",
+"[[ ## expected_outcome ## ]]",
+context.evalCase.expected_outcome,
+""
+];
+if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
+parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
+}
+parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
+for (const rubric of rubrics) {
+const requiredLabel = rubric.required ? " (REQUIRED)" : "";
+const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
+parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
+}
+parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
+return parts.join("\n");
+}
+async runWithRetry(options) {
+const { context, judgeProvider, systemPrompt, userPrompt, schema } = options;
+let lastError;
+for (let attempt = 1; attempt <= 3; attempt++) {
+try {
+const model = judgeProvider.asLanguageModel?.();
+if (model) {
+const { text: text2 } = await generateText({
+model,
+system: systemPrompt,
+prompt: userPrompt,
+...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
+...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
+});
+const data2 = schema.parse(parseJsonFromText(text2));
+return { data: data2 };
+}
+const response = await judgeProvider.invoke({
+question: userPrompt,
+systemPrompt,
+evalCaseId: context.evalCase.id,
+attempt: context.attempt,
+maxOutputTokens: this.maxOutputTokens,
+temperature: this.temperature
+});
+const data = schema.parse(parseJsonFromText(response.text ?? ""));
+return { data, providerResponse: response };
+} catch (e) {
+lastError = e instanceof Error ? e : new Error(String(e));
+}
+}
+throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
+}
 };
 function buildOutputSchema() {
 return [
@@ -37553,6 +37542,29 @@ function buildOutputSchema() {
 "}"
 ].join("\n");
 }
+function buildRubricOutputSchema() {
+return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
+You must return a valid JSON object matching this schema:
+{
+"checks": [
+{
+"id": "string (rubric id)",
+"satisfied": boolean,
+"reasoning": "string (brief explanation)"
+}
+],
+"overall_reasoning": "string (summary)"
+}`;
+}
+function scoreToVerdict(score) {
+if (score >= 0.8) {
+return "pass";
+}
+if (score >= 0.6) {
+return "borderline";
+}
+return "fail";
+}
 function clampScore(value) {
 if (Number.isNaN(value) || !Number.isFinite(value)) {
 return 0;
@@ -37565,71 +37577,15 @@ function clampScore(value) {
 }
 return value;
 }
-function parseQualityResponse(response) {
-const text2 = typeof response.text === "string" ? response.text.trim() : "";
-if (text2.length === 0) {
-return {};
-}
-const direct = attemptParseJson(text2);
-if (direct && validateQualityJson(direct)) {
-return direct;
-}
-const extracted = extractJsonBlob(text2);
-if (extracted) {
-const parsed = attemptParseJson(extracted);
-if (parsed && validateQualityJson(parsed)) {
-return parsed;
-}
-}
-return {};
-}
-function attemptParseJson(text2) {
-try {
-const parsed = JSON.parse(text2);
-const score = typeof parsed.score === "number" ? parsed.score : void 0;
-const hits = parsed.hits;
-const misses = parsed.misses;
-const reasoning = typeof parsed.reasoning === "string" ? parsed.reasoning : void 0;
-return { score, hits, misses, reasoning };
-} catch {
-return void 0;
-}
-}
-function validateQualityJson(parsed) {
-if (typeof parsed.score !== "number") {
-return false;
-}
-if (Number.isNaN(parsed.score) || !Number.isFinite(parsed.score)) {
-return false;
-}
-if (parsed.score < 0 || parsed.score > 1) {
-return false;
-}
-if (parsed.hits !== void 0) {
-if (!Array.isArray(parsed.hits)) {
-return false;
-}
-if (!parsed.hits.every((item) => typeof item === "string")) {
-return false;
-}
-}
-if (parsed.misses !== void 0) {
-if (!Array.isArray(parsed.misses)) {
-return false;
-}
-if (!parsed.misses.every((item) => typeof item === "string")) {
-return false;
-}
-}
-if (parsed.reasoning !== void 0 && typeof parsed.reasoning !== "string") {
-return false;
-}
-return true;
-}
 function extractJsonBlob(text2) {
 const match = text2.match(/\{[\s\S]*\}/);
 return match?.[0];
 }
+function parseJsonFromText(text2) {
+const cleaned = typeof text2 === "string" ? text2.replace(/```json\n?|```/g, "").trim() : "";
+const blob = extractJsonBlob(cleaned) ?? cleaned;
+return JSON.parse(blob);
+}
 function isNonEmptyString(value) {
 return typeof value === "string" && value.trim().length > 0;
 }
@@ -37666,6 +37622,7 @@ var CodeEvaluator = class {
 const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
 return {
 score,
+verdict: scoreToVerdict(score),
 hits,
 misses,
 expectedAspectCount: hits.length + misses.length || 1,
@@ -37679,6 +37636,7 @@ var CodeEvaluator = class {
 const message = error40 instanceof Error ? error40.message : String(error40);
 return {
 score: 0,
+verdict: "fail",
 hits: [],
 misses: [`Code evaluator failed: ${message}`],
 expectedAspectCount: 1,
@@ -37692,6 +37650,33 @@ var CodeEvaluator = class {
 }
 }
 };
+function calculateRubricScore(result, rubrics) {
+const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
+const hits = [];
+const misses = [];
+let totalWeight = 0;
+let earnedWeight = 0;
+let failedRequired = false;
+for (const check2 of result.checks) {
+const rubric = rubricMap.get(check2.id);
+if (!rubric) {
+continue;
+}
+totalWeight += rubric.weight;
+if (check2.satisfied) {
+earnedWeight += rubric.weight;
+hits.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
+} else {
+misses.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
+if (rubric.required) {
+failedRequired = true;
+}
+}
+}
+const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
+const verdict = failedRequired ? "fail" : scoreToVerdict(score);
+return { score, verdict, hits, misses };
+}
 async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
 const { spawn: spawn22 } = await import("node:child_process");
 return await new Promise((resolve2, reject) => {
@@ -37821,7 +37806,7 @@ function pLimit(concurrency) {
 activeCount--;
 resumeNext();
 };
-const
+const run2 = async (function_, resolve2, arguments_) => {
 const result = (async () => function_(...arguments_))();
 resolve2(result);
 try {
@@ -37834,7 +37819,7 @@ function pLimit(concurrency) {
 new Promise((internalResolve) => {
 queue.enqueue(internalResolve);
 }).then(
-
+run2.bind(void 0, function_, resolve2, arguments_)
 );
 (async () => {
 await Promise.resolve();
@@ -38417,7 +38402,6 @@ async function runEvaluatorList(options) {
 reasoning: score2.reasoning,
 evaluator_provider_request: score2.evaluatorRawRequest
 });
-continue;
 }
 if (evaluator.type === "code") {
 const codeEvaluator = new CodeEvaluator({
@@ -38445,44 +38429,12 @@ async function runEvaluatorList(options) {
 reasoning: score2.reasoning,
 evaluator_provider_request: score2.evaluatorRawRequest
 });
-continue;
-}
-if (evaluator.type === "rubric") {
-const rubricEvaluator = new RubricEvaluator({
-config: evaluator,
-resolveJudgeProvider: async (context) => {
-if (context.judgeProvider) {
-return context.judgeProvider;
-}
-return judgeProvider;
-}
-});
-const score2 = await rubricEvaluator.evaluate({
-evalCase,
-candidate,
-target,
-provider,
-attempt,
-promptInputs,
-now,
-judgeProvider
-});
-scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
-evaluatorResults.push({
-name: evaluator.name,
-type: evaluator.type,
-score: score2.score,
-verdict: score2.verdict,
-hits: score2.hits,
-misses: score2.misses,
-reasoning: score2.reasoning,
-evaluator_provider_request: score2.evaluatorRawRequest
-});
 }
 } catch (error40) {
 const message = error40 instanceof Error ? error40.message : String(error40);
 const fallbackScore = {
 score: 0,
+verdict: "fail",
 hits: [],
 misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
 expectedAspectCount: 1,
@@ -38497,6 +38449,7 @@ async function runEvaluatorList(options) {
 name: evaluator.name ?? "unknown",
 type: evaluator.type ?? "unknown",
 score: 0,
+verdict: "fail",
 hits: [],
 misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
 reasoning: message
@@ -38515,6 +38468,7 @@ async function runEvaluatorList(options) {
 const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
 const score = {
 score: aggregateScore,
+verdict: scoreToVerdict2(aggregateScore),
 hits,
 misses,
 expectedAspectCount,
@@ -38565,6 +38519,15 @@ async function resolveCustomPrompt(config2) {
 function isNonEmptyString2(value) {
 return typeof value === "string" && value.trim().length > 0;
 }
+function scoreToVerdict2(score) {
+if (score >= 0.8) {
+return "pass";
+}
+if (score >= 0.6) {
+return "borderline";
+}
+return "fail";
+}
 function filterEvalCases(evalCases, evalId) {
 if (!evalId) {
 return evalCases;
@@ -38778,9 +38741,6 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
 }
 return parts.join("\n");
 }
-function createAgentKernel() {
-return { status: "stub" };
-}

 // src/commands/eval/env.ts
 import { constants as constants4 } from "node:fs";
@@ -39513,17 +39473,18 @@ function formatEvaluationSummary(summary) {

 // ../../packages/core/dist/evaluation/validation/index.js
 import { readFile as readFile7 } from "node:fs/promises";
+import path16 from "node:path";
 import { parse as parse6 } from "yaml";
 import { readFile as readFile23 } from "node:fs/promises";
-import
+import path23 from "node:path";
 import { parse as parse23 } from "yaml";
 import { readFile as readFile33 } from "node:fs/promises";
-import
+import path33 from "node:path";
 import { parse as parse33 } from "yaml";
 import { readFile as readFile43 } from "node:fs/promises";
 import { parse as parse42 } from "yaml";
 import { readFile as readFile52 } from "node:fs/promises";
-import
+import path43 from "node:path";
 import { parse as parse52 } from "yaml";
 var SCHEMA_EVAL_V2 = "agentv-eval-v2";
 var SCHEMA_TARGETS_V2 = "agentv-targets-v2.2";
@@ -39533,12 +39494,12 @@ async function detectFileType(filePath) {
 const content = await readFile7(filePath, "utf8");
 const parsed = parse6(content);
 if (typeof parsed !== "object" || parsed === null) {
-return
+return inferFileTypeFromPath(filePath);
 }
 const record2 = parsed;
 const schema = record2.$schema;
 if (typeof schema !== "string") {
-return
+return inferFileTypeFromPath(filePath);
 }
 switch (schema) {
 case SCHEMA_EVAL_V2:
@@ -39548,18 +39509,31 @@ async function detectFileType(filePath) {
 case SCHEMA_CONFIG_V22:
 return "config";
 default:
-return
+return inferFileTypeFromPath(filePath);
 }
 } catch {
-return
+return inferFileTypeFromPath(filePath);
 }
 }
+function inferFileTypeFromPath(filePath) {
+const normalized = path16.normalize(filePath).replace(/\\/g, "/");
+const basename = path16.basename(filePath);
+if (normalized.includes("/.agentv/")) {
+if (basename === "config.yaml" || basename === "config.yml") {
+return "config";
+}
+if (basename === "targets.yaml" || basename === "targets.yml") {
+return "targets";
+}
+}
+return "eval";
+}
 function isObject2(value) {
 return typeof value === "object" && value !== null && !Array.isArray(value);
 }
 async function validateEvalFile(filePath) {
 const errors = [];
-const absolutePath =
+const absolutePath = path23.resolve(filePath);
 let parsed;
 try {
 const content = await readFile23(absolutePath, "utf8");
@@ -39908,7 +39882,7 @@ function validateUnknownSettings(target, provider, absolutePath, location, error
 }
 async function validateTargetsFile(filePath) {
 const errors = [];
-const absolutePath =
+const absolutePath = path33.resolve(filePath);
 let parsed;
 try {
 const content = await readFile33(absolutePath, "utf8");
@@ -40187,8 +40161,8 @@ async function validateConfigFile(filePath) {
 }
 const config2 = parsed;
 const schema = config2.$schema;
-if (schema !== SCHEMA_CONFIG_V222) {
-const message =
+if (schema !== void 0 && schema !== SCHEMA_CONFIG_V222) {
+const message = `Invalid $schema value '${schema}'. Expected '${SCHEMA_CONFIG_V222}' or omit the field.`;
 errors.push({
 severity: "error",
 filePath,
@@ -40250,7 +40224,7 @@ function isObject3(value) {
 }
 async function validateFileReferences(evalFilePath) {
 const errors = [];
-const absolutePath =
+const absolutePath = path43.resolve(evalFilePath);
 const gitRoot = await findGitRoot(absolutePath);
 if (!gitRoot) {
 errors.push({
@@ -40607,12 +40581,12 @@ function buildDefaultOutputPath(cwd, format) {
 const extension = getDefaultExtension(format);
 return path18.join(cwd, ".agentv", "results", `${baseName}_${timestamp}${extension}`);
 }
-function resolvePromptDirectory(
-if (
+function resolvePromptDirectory(option4, cwd) {
+if (option4 === void 0) {
 return void 0;
 }
-if (typeof
-return path18.resolve(cwd,
+if (typeof option4 === "string" && option4.trim().length > 0) {
+return path18.resolve(cwd, option4);
 }
 return path18.join(cwd, ".agentv", "prompts");
 }
@@ -40916,56 +40890,119 @@ async function resolveEvaluationRunner() {
 }

 // src/commands/eval/index.ts
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+var evalCommand = command({
+name: "eval",
+description: "Run eval suites and report results",
+args: {
+evalPaths: restPositionals({
+type: string4,
+displayName: "eval-paths",
+description: "Path(s) or glob(s) to evaluation .yaml file(s)"
+}),
+target: option({
+type: string4,
+long: "target",
+description: "Override target name from targets.yaml",
+defaultValue: () => "default"
+}),
+targets: option({
+type: optional2(string4),
+long: "targets",
+description: "Path to targets.yaml (overrides discovery)"
+}),
+evalId: option({
+type: optional2(string4),
+long: "eval-id",
+description: "Run only the eval case with this identifier"
+}),
+workers: option({
+type: number4,
+long: "workers",
+description: "Number of parallel workers (default: 1, max: 50). Can also be set per-target in targets.yaml",
+defaultValue: () => 1
+}),
+out: option({
+type: optional2(string4),
+long: "out",
+description: "Write results to the specified path"
+}),
+outputFormat: option({
+type: string4,
+long: "output-format",
+description: "Output format: 'jsonl' or 'yaml' (default: jsonl)",
+defaultValue: () => "jsonl"
+}),
+dryRun: flag({
+long: "dry-run",
+description: "Use mock provider responses instead of real LLM calls"
+}),
+dryRunDelay: option({
+type: number4,
+long: "dry-run-delay",
+description: "Fixed delay in milliseconds for dry-run mode (overridden by delay range if specified)",
+defaultValue: () => 0
+}),
+dryRunDelayMin: option({
+type: number4,
+long: "dry-run-delay-min",
+description: "Minimum delay in milliseconds for dry-run mode (requires --dry-run-delay-max)",
+defaultValue: () => 0
+}),
+dryRunDelayMax: option({
+type: number4,
+long: "dry-run-delay-max",
+description: "Maximum delay in milliseconds for dry-run mode (requires --dry-run-delay-min)",
+defaultValue: () => 0
+}),
+agentTimeout: option({
+type: number4,
+long: "agent-timeout",
+description: "Timeout in seconds for provider responses (default: 120)",
+defaultValue: () => 120
+}),
+maxRetries: option({
+type: number4,
+long: "max-retries",
+description: "Retry count for timeout recoveries (default: 2)",
+defaultValue: () => 2
+}),
+cache: flag({
+long: "cache",
+description: "Enable in-memory provider response cache"
+}),
+verbose: flag({
+long: "verbose",
+description: "Enable verbose logging"
+}),
+dumpPrompts: option({
+type: optional2(string4),
+long: "dump-prompts",
+description: "Directory path for persisting prompt payloads for debugging"
+})
+},
+handler: async (args) => {
+const resolvedPaths = await resolveEvalPaths(args.evalPaths, process.cwd());
+const dumpPrompts = args.dumpPrompts !== void 0 ? args.dumpPrompts === "." ? true : args.dumpPrompts : void 0;
+const rawOptions = {
+target: args.target,
+targets: args.targets,
+evalId: args.evalId,
+workers: args.workers,
+out: args.out,
+outputFormat: args.outputFormat,
+dryRun: args.dryRun,
+dryRunDelay: args.dryRunDelay,
+dryRunDelayMin: args.dryRunDelayMin,
+dryRunDelayMax: args.dryRunDelayMax,
+agentTimeout: args.agentTimeout,
+maxRetries: args.maxRetries,
+cache: args.cache,
+verbose: args.verbose,
+dumpPrompts
+};
 await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
-}
-
-}
+}
+});
 async function resolveEvalPaths(evalPaths, cwd) {
 const normalizedInputs = evalPaths.map((value) => value?.trim()).filter((value) => value);
 if (normalizedInputs.length === 0) {
@@ -41013,6 +41050,9 @@ async function resolveEvalPaths(evalPaths, cwd) {
 return sorted;
 }

+// src/commands/generate/index.ts
+import { command as command2, flag as flag2, option as option2, optional as optional3, positional as positional2, string as string5, subcommands } from "cmd-ts";
+
 // src/commands/generate/rubrics.ts
 import { readFile as readFile8, writeFile as writeFile6 } from "node:fs/promises";
 import path20 from "node:path";
@@ -41154,29 +41194,53 @@ function extractQuestion(evalCase) {
 }

 // src/commands/generate/index.ts
-
-
-
-
-
-
+var rubricsCommand = command2({
+name: "rubrics",
+description: "Generate rubrics from expected_outcome in YAML eval file",
+args: {
+file: positional2({
+type: string5,
+displayName: "file",
+description: "Path to YAML eval file"
+}),
+target: option2({
+type: optional3(string5),
+long: "target",
+short: "t",
+description: "Override target for rubric generation (default: file target or openai:gpt-4o)"
+}),
+verbose: flag2({
+long: "verbose",
+short: "v",
+description: "Show detailed progress"
+})
+},
+handler: async ({ file: file2, target, verbose }) => {
 try {
 await generateRubricsCommand({
 file: file2,
-target
-verbose
+target,
+verbose
 });
 } catch (error40) {
 console.error(`Error: ${error40.message}`);
 process.exit(1);
 }
-}
-}
+}
+});
+var generateCommand = subcommands({
+name: "generate",
+description: "Generate evaluation artifacts",
+cmds: {
+rubrics: rubricsCommand
+}
+});

 // src/commands/init/index.ts
 import { existsSync, mkdirSync, writeFileSync } from "node:fs";
 import path24 from "node:path";
 import * as readline from "node:readline/promises";
+import { command as command3, option as option3, optional as optional4, string as string6 } from "cmd-ts";

 // src/templates/index.ts
 import { readFileSync, readdirSync, statSync } from "node:fs";
@@ -41355,15 +41419,28 @@ Files installed to ${path24.relative(targetPath, claudeDir)}:`);
 console.log(" 2. Configure targets in .agentv/targets.yaml");
 console.log(" 3. Create eval files using the schema and prompt templates");
 }
+var initCmdTsCommand = command3({
+name: "init",
+description: "Initialize AgentV in your project (installs prompt templates and schema to .github)",
+args: {
+path: option3({
+type: optional4(string6),
+long: "path",
+description: "Target directory for initialization (default: current directory)"
+})
+},
+handler: async ({ path: targetPath }) => {
+try {
+await initCommand({ targetPath });
+} catch (error40) {
+console.error(`Error: ${error40.message}`);
+process.exit(1);
+}
+}
+});

-// src/commands/
-
-program.command("status").description("Show the latest AgentV kernel status").action(() => {
-const kernel = createAgentKernel();
-console.log(`Kernel status: ${kernel.status}`);
-});
-return program;
-}
+// src/commands/validate/index.ts
+import { command as command4, restPositionals as restPositionals2, string as string7 } from "cmd-ts";

 // src/commands/validate/format-output.ts
 var ANSI_RED3 = "\x1B[31m";
@@ -41468,20 +41545,6 @@ async function validateFiles(paths) {
 async function validateSingleFile(filePath) {
 const absolutePath = path25.resolve(filePath);
 const fileType = await detectFileType(absolutePath);
-if (fileType === "unknown") {
-return {
-valid: false,
-filePath: absolutePath,
-fileType: "unknown",
-errors: [
-{
-severity: "error",
-filePath: absolutePath,
-message: "Missing or invalid $schema field. File must declare schema: 'agentv-eval-v2', 'agentv-targets-v2', or 'agentv-config-v2'"
-}
-]
-};
-}
 let result;
 if (fileType === "eval") {
 result = await validateEvalFile(absolutePath);
@@ -41551,7 +41614,7 @@ function isYamlFile(filePath) {
 }

 // src/commands/validate/index.ts
-async function runValidateCommand(paths
+async function runValidateCommand(paths) {
 if (paths.length === 0) {
 console.error("Error: No paths specified. Usage: agentv validate <paths...>");
 process.exit(1);
@@ -41563,47 +41626,45 @@ async function runValidateCommand(paths, _options) {
 process.exit(1);
 }
 }
-
-
+var validateCommand = command4({
+name: "validate",
+description: "Validate AgentV eval and targets YAML files",
+args: {
+paths: restPositionals2({
+type: string7,
+displayName: "paths",
+description: "Files or directories to validate"
+})
+},
+handler: async ({ paths }) => {
 try {
-await runValidateCommand(paths
+await runValidateCommand(paths);
 } catch (error40) {
 console.error(`Error: ${error40.message}`);
 process.exit(1);
 }
-}
-
-}
+}
+});

 // src/index.ts
 var packageJson = JSON.parse(readFileSync2(new URL("../package.json", import.meta.url), "utf8"));
-
-
-
-
-
-
-
-
-
-
-
-await initCommand({ targetPath });
-} catch (error40) {
-console.error(`Error: ${error40.message}`);
-process.exit(1);
-}
-});
-return program;
-}
+var app = subcommands2({
+name: "agentv",
+description: "AgentV CLI",
+version: packageJson.version,
+cmds: {
+eval: evalCommand,
+validate: validateCommand,
+generate: generateCommand,
+init: initCmdTsCommand
+}
+});
 async function runCli(argv = process.argv) {
-
-await program.parseAsync(argv);
-return program;
+await run(binary(app), argv);
 }

 export {
-
+app,
 runCli
 };
-//# sourceMappingURL=chunk-
+//# sourceMappingURL=chunk-A5T7W63L.js.map