agentv 0.21.0 → 0.21.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -142,11 +142,20 @@ var require_dist = __commonJS({
142
142
 
143
143
  // src/index.ts
144
144
  import { readFileSync as readFileSync2 } from "node:fs";
145
- import { Command } from "commander";
145
+ import { binary, run, subcommands as subcommands2 } from "cmd-ts";
146
146
 
147
147
  // src/commands/eval/index.ts
148
148
  import { stat as stat4 } from "node:fs/promises";
149
149
  import path19 from "node:path";
150
+ import {
151
+ command,
152
+ flag,
153
+ number as number4,
154
+ option,
155
+ optional as optional2,
156
+ restPositionals,
157
+ string as string4
158
+ } from "cmd-ts";
150
159
  import fg from "fast-glob";
151
160
 
152
161
  // src/commands/eval/run-eval.ts
@@ -155,7 +164,7 @@ import { access as access6, mkdir as mkdir7 } from "node:fs/promises";
155
164
  import path18 from "node:path";
156
165
  import { pathToFileURL } from "node:url";
157
166
 
158
- // ../../packages/core/dist/chunk-BO7KG7JX.js
167
+ // ../../packages/core/dist/chunk-B2J23S7D.js
159
168
  import { constants } from "node:fs";
160
169
  import { access, readFile } from "node:fs/promises";
161
170
  import path from "node:path";
@@ -1039,8 +1048,8 @@ var ZodType = class {
1039
1048
  promise() {
1040
1049
  return ZodPromise.create(this, this._def);
1041
1050
  }
1042
- or(option) {
1043
- return ZodUnion.create([this, option], this._def);
1051
+ or(option4) {
1052
+ return ZodUnion.create([this, option4], this._def);
1044
1053
  }
1045
1054
  and(incoming) {
1046
1055
  return ZodIntersection.create(this, incoming, this._def);
@@ -2890,7 +2899,7 @@ var ZodUnion = class extends ZodType {
2890
2899
  return INVALID;
2891
2900
  }
2892
2901
  if (ctx.common.async) {
2893
- return Promise.all(options.map(async (option) => {
2902
+ return Promise.all(options.map(async (option4) => {
2894
2903
  const childCtx = {
2895
2904
  ...ctx,
2896
2905
  common: {
@@ -2900,7 +2909,7 @@ var ZodUnion = class extends ZodType {
2900
2909
  parent: null
2901
2910
  };
2902
2911
  return {
2903
- result: await option._parseAsync({
2912
+ result: await option4._parseAsync({
2904
2913
  data: ctx.data,
2905
2914
  path: ctx.path,
2906
2915
  parent: childCtx
@@ -2911,7 +2920,7 @@ var ZodUnion = class extends ZodType {
2911
2920
  } else {
2912
2921
  let dirty = void 0;
2913
2922
  const issues = [];
2914
- for (const option of options) {
2923
+ for (const option4 of options) {
2915
2924
  const childCtx = {
2916
2925
  ...ctx,
2917
2926
  common: {
@@ -2920,7 +2929,7 @@ var ZodUnion = class extends ZodType {
2920
2929
  },
2921
2930
  parent: null
2922
2931
  };
2923
- const result = option._parseSync({
2932
+ const result = option4._parseSync({
2924
2933
  data: ctx.data,
2925
2934
  path: ctx.path,
2926
2935
  parent: childCtx
@@ -3001,8 +3010,8 @@ var ZodDiscriminatedUnion = class _ZodDiscriminatedUnion extends ZodType {
3001
3010
  }
3002
3011
  const discriminator = this.discriminator;
3003
3012
  const discriminatorValue = ctx.data[discriminator];
3004
- const option = this.optionsMap.get(discriminatorValue);
3005
- if (!option) {
3013
+ const option4 = this.optionsMap.get(discriminatorValue);
3014
+ if (!option4) {
3006
3015
  addIssueToContext(ctx, {
3007
3016
  code: ZodIssueCode.invalid_union_discriminator,
3008
3017
  options: Array.from(this.optionsMap.keys()),
@@ -3011,13 +3020,13 @@ var ZodDiscriminatedUnion = class _ZodDiscriminatedUnion extends ZodType {
3011
3020
  return INVALID;
3012
3021
  }
3013
3022
  if (ctx.common.async) {
3014
- return option._parseAsync({
3023
+ return option4._parseAsync({
3015
3024
  data: ctx.data,
3016
3025
  path: ctx.path,
3017
3026
  parent: ctx
3018
3027
  });
3019
3028
  } else {
3020
- return option._parseSync({
3029
+ return option4._parseSync({
3021
3030
  data: ctx.data,
3022
3031
  path: ctx.path,
3023
3032
  parent: ctx
@@ -4201,7 +4210,7 @@ var coerce = {
4201
4210
  };
4202
4211
  var NEVER = INVALID;
4203
4212
 
4204
- // ../../packages/core/dist/chunk-BO7KG7JX.js
4213
+ // ../../packages/core/dist/chunk-B2J23S7D.js
4205
4214
  async function fileExists(filePath) {
4206
4215
  try {
4207
4216
  await access(filePath, constants.F_OK);
@@ -4577,9 +4586,9 @@ function resolveVSCodeConfig(target, env, insiders) {
4577
4586
  const dryRunSource = target.dry_run ?? target.dryRun;
4578
4587
  const subagentRootSource = target.subagent_root ?? target.subagentRoot;
4579
4588
  const defaultCommand = insiders ? "code-insiders" : "code";
4580
- const command = resolveOptionalLiteralString(commandSource) ?? defaultCommand;
4589
+ const command5 = resolveOptionalLiteralString(commandSource) ?? defaultCommand;
4581
4590
  return {
4582
- command,
4591
+ command: command5,
4583
4592
  waitForResponse: resolveOptionalBoolean(waitSource) ?? true,
4584
4593
  dryRun: resolveOptionalBoolean(dryRunSource) ?? false,
4585
4594
  subagentRoot: resolveOptionalString(subagentRootSource, env, `${target.name} subagent root`, {
@@ -8081,7 +8090,7 @@ var $ZodUnion = /* @__PURE__ */ $constructor("$ZodUnion", (inst, def) => {
8081
8090
  defineLazy(inst._zod, "optout", () => def.options.some((o) => o._zod.optout === "optional") ? "optional" : void 0);
8082
8091
  defineLazy(inst._zod, "values", () => {
8083
8092
  if (def.options.every((o) => o._zod.values)) {
8084
- return new Set(def.options.flatMap((option) => Array.from(option._zod.values)));
8093
+ return new Set(def.options.flatMap((option4) => Array.from(option4._zod.values)));
8085
8094
  }
8086
8095
  return void 0;
8087
8096
  });
@@ -8095,8 +8104,8 @@ var $ZodUnion = /* @__PURE__ */ $constructor("$ZodUnion", (inst, def) => {
8095
8104
  inst._zod.parse = (payload, ctx) => {
8096
8105
  let async = false;
8097
8106
  const results = [];
8098
- for (const option of def.options) {
8099
- const result = option._zod.run({
8107
+ for (const option4 of def.options) {
8108
+ const result = option4._zod.run({
8100
8109
  value: payload.value,
8101
8110
  issues: []
8102
8111
  }, ctx);
@@ -8121,10 +8130,10 @@ var $ZodDiscriminatedUnion = /* @__PURE__ */ $constructor("$ZodDiscriminatedUnio
8121
8130
  const _super = inst._zod.parse;
8122
8131
  defineLazy(inst._zod, "propValues", () => {
8123
8132
  const propValues = {};
8124
- for (const option of def.options) {
8125
- const pv = option._zod.propValues;
8133
+ for (const option4 of def.options) {
8134
+ const pv = option4._zod.propValues;
8126
8135
  if (!pv || Object.keys(pv).length === 0)
8127
- throw new Error(`Invalid discriminated union option at index "${def.options.indexOf(option)}"`);
8136
+ throw new Error(`Invalid discriminated union option at index "${def.options.indexOf(option4)}"`);
8128
8137
  for (const [k, v] of Object.entries(pv)) {
8129
8138
  if (!propValues[k])
8130
8139
  propValues[k] = /* @__PURE__ */ new Set();
@@ -15328,8 +15337,8 @@ function isTransforming(_schema, _ctx) {
15328
15337
  return false;
15329
15338
  }
15330
15339
  case "union": {
15331
- for (const option of def.options) {
15332
- if (isTransforming(option, ctx))
15340
+ for (const option4 of def.options) {
15341
+ if (isTransforming(option4, ctx))
15333
15342
  return true;
15334
15343
  }
15335
15344
  return false;
@@ -34920,25 +34929,25 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
34920
34929
  }
34921
34930
  }
34922
34931
  const _model = asString2(rawEvaluator.model);
34932
+ const rawRubrics = rawEvaluator.rubrics;
34933
+ const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
34934
+ id: asString2(rubric.id) ?? `rubric-${index + 1}`,
34935
+ description: asString2(rubric.description) ?? "",
34936
+ weight: typeof rubric.weight === "number" ? rubric.weight : 1,
34937
+ required: typeof rubric.required === "boolean" ? rubric.required : true
34938
+ })).filter((r) => r.description.length > 0) : void 0;
34923
34939
  if (typeValue === "rubric") {
34924
- const rubrics = rawEvaluator.rubrics;
34925
- if (!Array.isArray(rubrics)) {
34940
+ if (!parsedRubrics) {
34926
34941
  logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': missing rubrics array`);
34927
34942
  continue;
34928
34943
  }
34929
- const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
34930
- id: asString2(rubric.id) ?? `rubric-${index + 1}`,
34931
- description: asString2(rubric.description) ?? "",
34932
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
34933
- required: typeof rubric.required === "boolean" ? rubric.required : true
34934
- })).filter((r) => r.description.length > 0);
34935
34944
  if (parsedRubrics.length === 0) {
34936
34945
  logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': no valid rubrics found`);
34937
34946
  continue;
34938
34947
  }
34939
34948
  evaluators.push({
34940
34949
  name: name16,
34941
- type: "rubric",
34950
+ type: "llm_judge",
34942
34951
  rubrics: parsedRubrics
34943
34952
  });
34944
34953
  continue;
@@ -34947,7 +34956,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
34947
34956
  name: name16,
34948
34957
  type: "llm_judge",
34949
34958
  prompt,
34950
- promptPath
34959
+ promptPath,
34960
+ ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
34951
34961
  });
34952
34962
  }
34953
34963
  return evaluators.length > 0 ? evaluators : void 0;
@@ -35488,7 +35498,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
35488
35498
  if (rubricItems.length > 0) {
35489
35499
  const rubricEvaluator = {
35490
35500
  name: "rubric",
35491
- type: "rubric",
35501
+ type: "llm_judge",
35492
35502
  rubrics: rubricItems
35493
35503
  };
35494
35504
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
@@ -35887,7 +35897,7 @@ async function withRetry(fn, retryConfig, signal) {
35887
35897
  }
35888
35898
  var execAsync2 = promisify2(execWithCallback);
35889
35899
  var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
35890
- async function defaultCommandRunner(command, options) {
35900
+ async function defaultCommandRunner(command5, options) {
35891
35901
  const execOptions = {
35892
35902
  cwd: options.cwd,
35893
35903
  env: options.env,
@@ -35897,7 +35907,7 @@ async function defaultCommandRunner(command, options) {
35897
35907
  shell: process.platform === "win32" ? "powershell.exe" : void 0
35898
35908
  };
35899
35909
  try {
35900
- const { stdout, stderr } = await execAsync2(command, execOptions);
35910
+ const { stdout, stderr } = await execAsync2(command5, execOptions);
35901
35911
  return {
35902
35912
  stdout,
35903
35913
  stderr,
@@ -37321,144 +37331,6 @@ function createProvider(target) {
37321
37331
  }
37322
37332
  }
37323
37333
  }
37324
- var rubricCheckResultSchema = external_exports.object({
37325
- id: external_exports.string().describe("The ID of the rubric item being checked"),
37326
- satisfied: external_exports.boolean().describe("Whether this rubric requirement is met"),
37327
- reasoning: external_exports.string().describe("Brief explanation (1-2 sentences) for this check")
37328
- });
37329
- var rubricEvaluationSchema = external_exports.object({
37330
- checks: external_exports.array(rubricCheckResultSchema).describe("Results for each rubric item"),
37331
- overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)")
37332
- });
37333
- var RubricEvaluator = class {
37334
- kind = "rubric";
37335
- config;
37336
- resolveJudgeProvider;
37337
- constructor(options) {
37338
- this.config = options.config;
37339
- this.resolveJudgeProvider = options.resolveJudgeProvider;
37340
- }
37341
- async evaluate(context) {
37342
- const judgeProvider = await this.resolveJudgeProvider(context);
37343
- if (!judgeProvider) {
37344
- throw new Error("No judge provider available for rubric evaluation");
37345
- }
37346
- if (!this.config.rubrics || this.config.rubrics.length === 0) {
37347
- throw new Error(
37348
- `No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
37349
- );
37350
- }
37351
- const prompt = this.buildPrompt(context, this.config.rubrics);
37352
- const model = judgeProvider.asLanguageModel?.();
37353
- if (!model) {
37354
- throw new Error("Judge provider does not support language model interface");
37355
- }
37356
- const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
37357
- You must return a valid JSON object matching this schema:
37358
- {
37359
- "checks": [
37360
- {
37361
- "id": "string (rubric id)",
37362
- "satisfied": boolean,
37363
- "reasoning": "string (brief explanation)"
37364
- }
37365
- ],
37366
- "overall_reasoning": "string (summary)"
37367
- }`;
37368
- let result;
37369
- let lastError;
37370
- for (let attempt = 1; attempt <= 3; attempt++) {
37371
- try {
37372
- const { text: text2 } = await generateText({
37373
- model,
37374
- system,
37375
- prompt
37376
- });
37377
- const cleaned = text2.replace(/```json\n?|```/g, "").trim();
37378
- result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
37379
- break;
37380
- } catch (e) {
37381
- lastError = e instanceof Error ? e : new Error(String(e));
37382
- }
37383
- }
37384
- if (!result) {
37385
- throw new Error(
37386
- `Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
37387
- );
37388
- }
37389
- const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
37390
- return {
37391
- score,
37392
- verdict,
37393
- hits,
37394
- misses,
37395
- expectedAspectCount: this.config.rubrics.length,
37396
- reasoning: result.overall_reasoning,
37397
- evaluatorRawRequest: {
37398
- prompt
37399
- }
37400
- };
37401
- }
37402
- buildPrompt(context, rubrics) {
37403
- const parts = [
37404
- "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
37405
- "",
37406
- "[[ ## question ## ]]",
37407
- context.evalCase.question,
37408
- "",
37409
- "[[ ## expected_outcome ## ]]",
37410
- context.evalCase.expected_outcome,
37411
- ""
37412
- ];
37413
- if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
37414
- parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
37415
- }
37416
- parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
37417
- for (const rubric of rubrics) {
37418
- const requiredLabel = rubric.required ? " (REQUIRED)" : "";
37419
- const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
37420
- parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
37421
- }
37422
- parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
37423
- return parts.join("\n");
37424
- }
37425
- calculateScore(result, rubrics) {
37426
- const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
37427
- const hits = [];
37428
- const misses = [];
37429
- let totalWeight = 0;
37430
- let earnedWeight = 0;
37431
- let failedRequired = false;
37432
- for (const check2 of result.checks) {
37433
- const rubric = rubricMap.get(check2.id);
37434
- if (!rubric) {
37435
- continue;
37436
- }
37437
- totalWeight += rubric.weight;
37438
- if (check2.satisfied) {
37439
- earnedWeight += rubric.weight;
37440
- hits.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
37441
- } else {
37442
- misses.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
37443
- if (rubric.required) {
37444
- failedRequired = true;
37445
- }
37446
- }
37447
- }
37448
- const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
37449
- let verdict;
37450
- if (failedRequired) {
37451
- verdict = "fail";
37452
- } else if (score >= 0.8) {
37453
- verdict = "pass";
37454
- } else if (score >= 0.6) {
37455
- verdict = "borderline";
37456
- } else {
37457
- verdict = "fail";
37458
- }
37459
- return { score, verdict, hits, misses };
37460
- }
37461
- };
37462
37334
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
37463
37335
 
37464
37336
  Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -37476,6 +37348,21 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
37476
37348
 
37477
37349
  [[ ## candidate_answer ## ]]
37478
37350
  {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
37351
+ var freeformEvaluationSchema = external_exports.object({
37352
+ score: external_exports.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
37353
+ hits: external_exports.array(external_exports.string()).describe("Brief specific achievements").optional(),
37354
+ misses: external_exports.array(external_exports.string()).describe("Brief failures or omissions").optional(),
37355
+ reasoning: external_exports.string().describe("Concise explanation (1-2 sentences)").optional()
37356
+ });
37357
+ var rubricCheckResultSchema = external_exports.object({
37358
+ id: external_exports.string().describe("The ID of the rubric item being checked"),
37359
+ satisfied: external_exports.boolean().describe("Whether this rubric requirement is met"),
37360
+ reasoning: external_exports.string().describe("Brief explanation (1-2 sentences) for this check")
37361
+ });
37362
+ var rubricEvaluationSchema = external_exports.object({
37363
+ checks: external_exports.array(rubricCheckResultSchema).describe("Results for each rubric item"),
37364
+ overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)")
37365
+ });
37479
37366
  var LlmJudgeEvaluator = class {
37480
37367
  kind = "llm_judge";
37481
37368
  resolveJudgeProvider;
@@ -37493,9 +37380,13 @@ var LlmJudgeEvaluator = class {
37493
37380
  if (!judgeProvider) {
37494
37381
  throw new Error("No judge provider available for LLM grading");
37495
37382
  }
37496
- return this.evaluateWithPrompt(context, judgeProvider);
37383
+ const config2 = context.evaluator;
37384
+ if (config2?.type === "llm_judge" && config2.rubrics && config2.rubrics.length > 0) {
37385
+ return this.evaluateWithRubrics(context, judgeProvider, config2.rubrics);
37386
+ }
37387
+ return this.evaluateFreeform(context, judgeProvider);
37497
37388
  }
37498
- async evaluateWithPrompt(context, judgeProvider) {
37389
+ async evaluateFreeform(context, judgeProvider) {
37499
37390
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
37500
37391
  const variables = {
37501
37392
  [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
@@ -37512,34 +37403,132 @@ var LlmJudgeEvaluator = class {
37512
37403
  const systemPrompt = buildOutputSchema();
37513
37404
  const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
37514
37405
  const userPrompt = substituteVariables(evaluatorTemplate, variables);
37515
- const response = await judgeProvider.invoke({
37516
- question: userPrompt,
37517
- systemPrompt,
37518
- evalCaseId: context.evalCase.id,
37519
- attempt: context.attempt,
37520
- maxOutputTokens: this.maxOutputTokens,
37521
- temperature: this.temperature
37522
- });
37523
- const parsed = parseQualityResponse(response);
37524
- const score = clampScore(parsed.score ?? 0);
37525
- const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
37526
- const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
37527
- const reasoning = parsed.reasoning ?? response.reasoning;
37528
- const expectedAspectCount = Math.max(hits.length + misses.length, 1);
37529
37406
  const evaluatorRawRequest = {
37530
37407
  userPrompt,
37531
37408
  systemPrompt,
37532
37409
  target: judgeProvider.targetName
37533
37410
  };
37411
+ try {
37412
+ const { data, providerResponse } = await this.runWithRetry({
37413
+ context,
37414
+ judgeProvider,
37415
+ systemPrompt,
37416
+ userPrompt,
37417
+ schema: freeformEvaluationSchema
37418
+ });
37419
+ const score = clampScore(data.score);
37420
+ const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
37421
+ const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
37422
+ const reasoning = data.reasoning ?? providerResponse?.reasoning;
37423
+ const expectedAspectCount = Math.max(hits.length + misses.length, 1);
37424
+ return {
37425
+ score,
37426
+ verdict: scoreToVerdict(score),
37427
+ hits,
37428
+ misses,
37429
+ expectedAspectCount,
37430
+ reasoning,
37431
+ evaluatorRawRequest
37432
+ };
37433
+ } catch {
37434
+ return {
37435
+ score: 0,
37436
+ verdict: "fail",
37437
+ hits: [],
37438
+ misses: [],
37439
+ expectedAspectCount: 1,
37440
+ evaluatorRawRequest
37441
+ };
37442
+ }
37443
+ }
37444
+ async evaluateWithRubrics(context, judgeProvider, rubrics) {
37445
+ if (!rubrics || rubrics.length === 0) {
37446
+ throw new Error(
37447
+ `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
37448
+ );
37449
+ }
37450
+ const prompt = this.buildRubricPrompt(context, rubrics);
37451
+ const systemPrompt = buildRubricOutputSchema();
37452
+ const evaluatorRawRequest = {
37453
+ userPrompt: prompt,
37454
+ systemPrompt,
37455
+ target: judgeProvider.targetName
37456
+ };
37457
+ const { data } = await this.runWithRetry({
37458
+ context,
37459
+ judgeProvider,
37460
+ systemPrompt,
37461
+ userPrompt: prompt,
37462
+ schema: rubricEvaluationSchema
37463
+ });
37464
+ const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
37534
37465
  return {
37535
37466
  score,
37467
+ verdict,
37536
37468
  hits,
37537
37469
  misses,
37538
- expectedAspectCount,
37539
- reasoning,
37470
+ expectedAspectCount: rubrics.length,
37471
+ reasoning: data.overall_reasoning,
37540
37472
  evaluatorRawRequest
37541
37473
  };
37542
37474
  }
37475
+ buildRubricPrompt(context, rubrics) {
37476
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
37477
+ const parts = [
37478
+ "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
37479
+ "",
37480
+ "[[ ## question ## ]]",
37481
+ formattedQuestion,
37482
+ "",
37483
+ "[[ ## expected_outcome ## ]]",
37484
+ context.evalCase.expected_outcome,
37485
+ ""
37486
+ ];
37487
+ if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
37488
+ parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
37489
+ }
37490
+ parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
37491
+ for (const rubric of rubrics) {
37492
+ const requiredLabel = rubric.required ? " (REQUIRED)" : "";
37493
+ const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
37494
+ parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
37495
+ }
37496
+ parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
37497
+ return parts.join("\n");
37498
+ }
37499
+ async runWithRetry(options) {
37500
+ const { context, judgeProvider, systemPrompt, userPrompt, schema } = options;
37501
+ let lastError;
37502
+ for (let attempt = 1; attempt <= 3; attempt++) {
37503
+ try {
37504
+ const model = judgeProvider.asLanguageModel?.();
37505
+ if (model) {
37506
+ const { text: text2 } = await generateText({
37507
+ model,
37508
+ system: systemPrompt,
37509
+ prompt: userPrompt,
37510
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
37511
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
37512
+ });
37513
+ const data2 = schema.parse(parseJsonFromText(text2));
37514
+ return { data: data2 };
37515
+ }
37516
+ const response = await judgeProvider.invoke({
37517
+ question: userPrompt,
37518
+ systemPrompt,
37519
+ evalCaseId: context.evalCase.id,
37520
+ attempt: context.attempt,
37521
+ maxOutputTokens: this.maxOutputTokens,
37522
+ temperature: this.temperature
37523
+ });
37524
+ const data = schema.parse(parseJsonFromText(response.text ?? ""));
37525
+ return { data, providerResponse: response };
37526
+ } catch (e) {
37527
+ lastError = e instanceof Error ? e : new Error(String(e));
37528
+ }
37529
+ }
37530
+ throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
37531
+ }
37543
37532
  };
37544
37533
  function buildOutputSchema() {
37545
37534
  return [
@@ -37553,6 +37542,29 @@ function buildOutputSchema() {
37553
37542
  "}"
37554
37543
  ].join("\n");
37555
37544
  }
37545
+ function buildRubricOutputSchema() {
37546
+ return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
37547
+ You must return a valid JSON object matching this schema:
37548
+ {
37549
+ "checks": [
37550
+ {
37551
+ "id": "string (rubric id)",
37552
+ "satisfied": boolean,
37553
+ "reasoning": "string (brief explanation)"
37554
+ }
37555
+ ],
37556
+ "overall_reasoning": "string (summary)"
37557
+ }`;
37558
+ }
37559
+ function scoreToVerdict(score) {
37560
+ if (score >= 0.8) {
37561
+ return "pass";
37562
+ }
37563
+ if (score >= 0.6) {
37564
+ return "borderline";
37565
+ }
37566
+ return "fail";
37567
+ }
37556
37568
  function clampScore(value) {
37557
37569
  if (Number.isNaN(value) || !Number.isFinite(value)) {
37558
37570
  return 0;
@@ -37565,71 +37577,15 @@ function clampScore(value) {
37565
37577
  }
37566
37578
  return value;
37567
37579
  }
37568
- function parseQualityResponse(response) {
37569
- const text2 = typeof response.text === "string" ? response.text.trim() : "";
37570
- if (text2.length === 0) {
37571
- return {};
37572
- }
37573
- const direct = attemptParseJson(text2);
37574
- if (direct && validateQualityJson(direct)) {
37575
- return direct;
37576
- }
37577
- const extracted = extractJsonBlob(text2);
37578
- if (extracted) {
37579
- const parsed = attemptParseJson(extracted);
37580
- if (parsed && validateQualityJson(parsed)) {
37581
- return parsed;
37582
- }
37583
- }
37584
- return {};
37585
- }
37586
- function attemptParseJson(text2) {
37587
- try {
37588
- const parsed = JSON.parse(text2);
37589
- const score = typeof parsed.score === "number" ? parsed.score : void 0;
37590
- const hits = parsed.hits;
37591
- const misses = parsed.misses;
37592
- const reasoning = typeof parsed.reasoning === "string" ? parsed.reasoning : void 0;
37593
- return { score, hits, misses, reasoning };
37594
- } catch {
37595
- return void 0;
37596
- }
37597
- }
37598
- function validateQualityJson(parsed) {
37599
- if (typeof parsed.score !== "number") {
37600
- return false;
37601
- }
37602
- if (Number.isNaN(parsed.score) || !Number.isFinite(parsed.score)) {
37603
- return false;
37604
- }
37605
- if (parsed.score < 0 || parsed.score > 1) {
37606
- return false;
37607
- }
37608
- if (parsed.hits !== void 0) {
37609
- if (!Array.isArray(parsed.hits)) {
37610
- return false;
37611
- }
37612
- if (!parsed.hits.every((item) => typeof item === "string")) {
37613
- return false;
37614
- }
37615
- }
37616
- if (parsed.misses !== void 0) {
37617
- if (!Array.isArray(parsed.misses)) {
37618
- return false;
37619
- }
37620
- if (!parsed.misses.every((item) => typeof item === "string")) {
37621
- return false;
37622
- }
37623
- }
37624
- if (parsed.reasoning !== void 0 && typeof parsed.reasoning !== "string") {
37625
- return false;
37626
- }
37627
- return true;
37628
- }
37629
37580
  function extractJsonBlob(text2) {
37630
37581
  const match = text2.match(/\{[\s\S]*\}/);
37631
37582
  return match?.[0];
37632
37583
  }
37584
+ function parseJsonFromText(text2) {
37585
+ const cleaned = typeof text2 === "string" ? text2.replace(/```json\n?|```/g, "").trim() : "";
37586
+ const blob = extractJsonBlob(cleaned) ?? cleaned;
37587
+ return JSON.parse(blob);
37588
+ }
37633
37589
  function isNonEmptyString(value) {
37634
37590
  return typeof value === "string" && value.trim().length > 0;
37635
37591
  }
@@ -37666,6 +37622,7 @@ var CodeEvaluator = class {
37666
37622
  const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
37667
37623
  return {
37668
37624
  score,
37625
+ verdict: scoreToVerdict(score),
37669
37626
  hits,
37670
37627
  misses,
37671
37628
  expectedAspectCount: hits.length + misses.length || 1,
@@ -37679,6 +37636,7 @@ var CodeEvaluator = class {
37679
37636
  const message = error40 instanceof Error ? error40.message : String(error40);
37680
37637
  return {
37681
37638
  score: 0,
37639
+ verdict: "fail",
37682
37640
  hits: [],
37683
37641
  misses: [`Code evaluator failed: ${message}`],
37684
37642
  expectedAspectCount: 1,
@@ -37692,6 +37650,33 @@ var CodeEvaluator = class {
37692
37650
  }
37693
37651
  }
37694
37652
  };
37653
+ function calculateRubricScore(result, rubrics) {
37654
+ const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
37655
+ const hits = [];
37656
+ const misses = [];
37657
+ let totalWeight = 0;
37658
+ let earnedWeight = 0;
37659
+ let failedRequired = false;
37660
+ for (const check2 of result.checks) {
37661
+ const rubric = rubricMap.get(check2.id);
37662
+ if (!rubric) {
37663
+ continue;
37664
+ }
37665
+ totalWeight += rubric.weight;
37666
+ if (check2.satisfied) {
37667
+ earnedWeight += rubric.weight;
37668
+ hits.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
37669
+ } else {
37670
+ misses.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
37671
+ if (rubric.required) {
37672
+ failedRequired = true;
37673
+ }
37674
+ }
37675
+ }
37676
+ const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
37677
+ const verdict = failedRequired ? "fail" : scoreToVerdict(score);
37678
+ return { score, verdict, hits, misses };
37679
+ }
37695
37680
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
37696
37681
  const { spawn: spawn22 } = await import("node:child_process");
37697
37682
  return await new Promise((resolve2, reject) => {
@@ -37821,7 +37806,7 @@ function pLimit(concurrency) {
37821
37806
  activeCount--;
37822
37807
  resumeNext();
37823
37808
  };
37824
- const run = async (function_, resolve2, arguments_) => {
37809
+ const run2 = async (function_, resolve2, arguments_) => {
37825
37810
  const result = (async () => function_(...arguments_))();
37826
37811
  resolve2(result);
37827
37812
  try {
@@ -37834,7 +37819,7 @@ function pLimit(concurrency) {
37834
37819
  new Promise((internalResolve) => {
37835
37820
  queue.enqueue(internalResolve);
37836
37821
  }).then(
37837
- run.bind(void 0, function_, resolve2, arguments_)
37822
+ run2.bind(void 0, function_, resolve2, arguments_)
37838
37823
  );
37839
37824
  (async () => {
37840
37825
  await Promise.resolve();
@@ -38417,7 +38402,6 @@ async function runEvaluatorList(options) {
38417
38402
  reasoning: score2.reasoning,
38418
38403
  evaluator_provider_request: score2.evaluatorRawRequest
38419
38404
  });
38420
- continue;
38421
38405
  }
38422
38406
  if (evaluator.type === "code") {
38423
38407
  const codeEvaluator = new CodeEvaluator({
@@ -38445,44 +38429,12 @@ async function runEvaluatorList(options) {
38445
38429
  reasoning: score2.reasoning,
38446
38430
  evaluator_provider_request: score2.evaluatorRawRequest
38447
38431
  });
38448
- continue;
38449
- }
38450
- if (evaluator.type === "rubric") {
38451
- const rubricEvaluator = new RubricEvaluator({
38452
- config: evaluator,
38453
- resolveJudgeProvider: async (context) => {
38454
- if (context.judgeProvider) {
38455
- return context.judgeProvider;
38456
- }
38457
- return judgeProvider;
38458
- }
38459
- });
38460
- const score2 = await rubricEvaluator.evaluate({
38461
- evalCase,
38462
- candidate,
38463
- target,
38464
- provider,
38465
- attempt,
38466
- promptInputs,
38467
- now,
38468
- judgeProvider
38469
- });
38470
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
38471
- evaluatorResults.push({
38472
- name: evaluator.name,
38473
- type: evaluator.type,
38474
- score: score2.score,
38475
- verdict: score2.verdict,
38476
- hits: score2.hits,
38477
- misses: score2.misses,
38478
- reasoning: score2.reasoning,
38479
- evaluator_provider_request: score2.evaluatorRawRequest
38480
- });
38481
38432
  }
38482
38433
  } catch (error40) {
38483
38434
  const message = error40 instanceof Error ? error40.message : String(error40);
38484
38435
  const fallbackScore = {
38485
38436
  score: 0,
38437
+ verdict: "fail",
38486
38438
  hits: [],
38487
38439
  misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
38488
38440
  expectedAspectCount: 1,
@@ -38497,6 +38449,7 @@ async function runEvaluatorList(options) {
38497
38449
  name: evaluator.name ?? "unknown",
38498
38450
  type: evaluator.type ?? "unknown",
38499
38451
  score: 0,
38452
+ verdict: "fail",
38500
38453
  hits: [],
38501
38454
  misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
38502
38455
  reasoning: message
@@ -38515,6 +38468,7 @@ async function runEvaluatorList(options) {
38515
38468
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
38516
38469
  const score = {
38517
38470
  score: aggregateScore,
38471
+ verdict: scoreToVerdict2(aggregateScore),
38518
38472
  hits,
38519
38473
  misses,
38520
38474
  expectedAspectCount,
@@ -38565,6 +38519,15 @@ async function resolveCustomPrompt(config2) {
38565
38519
  function isNonEmptyString2(value) {
38566
38520
  return typeof value === "string" && value.trim().length > 0;
38567
38521
  }
38522
+ function scoreToVerdict2(score) {
38523
+ if (score >= 0.8) {
38524
+ return "pass";
38525
+ }
38526
+ if (score >= 0.6) {
38527
+ return "borderline";
38528
+ }
38529
+ return "fail";
38530
+ }
38568
38531
  function filterEvalCases(evalCases, evalId) {
38569
38532
  if (!evalId) {
38570
38533
  return evalCases;
@@ -38778,9 +38741,6 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
38778
38741
  }
38779
38742
  return parts.join("\n");
38780
38743
  }
38781
- function createAgentKernel() {
38782
- return { status: "stub" };
38783
- }
38784
38744
 
38785
38745
  // src/commands/eval/env.ts
38786
38746
  import { constants as constants4 } from "node:fs";
@@ -39513,17 +39473,18 @@ function formatEvaluationSummary(summary) {
39513
39473
 
39514
39474
  // ../../packages/core/dist/evaluation/validation/index.js
39515
39475
  import { readFile as readFile7 } from "node:fs/promises";
39476
+ import path16 from "node:path";
39516
39477
  import { parse as parse6 } from "yaml";
39517
39478
  import { readFile as readFile23 } from "node:fs/promises";
39518
- import path16 from "node:path";
39479
+ import path23 from "node:path";
39519
39480
  import { parse as parse23 } from "yaml";
39520
39481
  import { readFile as readFile33 } from "node:fs/promises";
39521
- import path23 from "node:path";
39482
+ import path33 from "node:path";
39522
39483
  import { parse as parse33 } from "yaml";
39523
39484
  import { readFile as readFile43 } from "node:fs/promises";
39524
39485
  import { parse as parse42 } from "yaml";
39525
39486
  import { readFile as readFile52 } from "node:fs/promises";
39526
- import path33 from "node:path";
39487
+ import path43 from "node:path";
39527
39488
  import { parse as parse52 } from "yaml";
39528
39489
  var SCHEMA_EVAL_V2 = "agentv-eval-v2";
39529
39490
  var SCHEMA_TARGETS_V2 = "agentv-targets-v2.2";
@@ -39533,12 +39494,12 @@ async function detectFileType(filePath) {
39533
39494
  const content = await readFile7(filePath, "utf8");
39534
39495
  const parsed = parse6(content);
39535
39496
  if (typeof parsed !== "object" || parsed === null) {
39536
- return "unknown";
39497
+ return inferFileTypeFromPath(filePath);
39537
39498
  }
39538
39499
  const record2 = parsed;
39539
39500
  const schema = record2.$schema;
39540
39501
  if (typeof schema !== "string") {
39541
- return "unknown";
39502
+ return inferFileTypeFromPath(filePath);
39542
39503
  }
39543
39504
  switch (schema) {
39544
39505
  case SCHEMA_EVAL_V2:
@@ -39548,18 +39509,31 @@ async function detectFileType(filePath) {
39548
39509
  case SCHEMA_CONFIG_V22:
39549
39510
  return "config";
39550
39511
  default:
39551
- return "unknown";
39512
+ return inferFileTypeFromPath(filePath);
39552
39513
  }
39553
39514
  } catch {
39554
- return "unknown";
39515
+ return inferFileTypeFromPath(filePath);
39555
39516
  }
39556
39517
  }
39518
+ function inferFileTypeFromPath(filePath) {
39519
+ const normalized = path16.normalize(filePath).replace(/\\/g, "/");
39520
+ const basename = path16.basename(filePath);
39521
+ if (normalized.includes("/.agentv/")) {
39522
+ if (basename === "config.yaml" || basename === "config.yml") {
39523
+ return "config";
39524
+ }
39525
+ if (basename === "targets.yaml" || basename === "targets.yml") {
39526
+ return "targets";
39527
+ }
39528
+ }
39529
+ return "eval";
39530
+ }
39557
39531
  function isObject2(value) {
39558
39532
  return typeof value === "object" && value !== null && !Array.isArray(value);
39559
39533
  }
39560
39534
  async function validateEvalFile(filePath) {
39561
39535
  const errors = [];
39562
- const absolutePath = path16.resolve(filePath);
39536
+ const absolutePath = path23.resolve(filePath);
39563
39537
  let parsed;
39564
39538
  try {
39565
39539
  const content = await readFile23(absolutePath, "utf8");
@@ -39908,7 +39882,7 @@ function validateUnknownSettings(target, provider, absolutePath, location, error
39908
39882
  }
39909
39883
  async function validateTargetsFile(filePath) {
39910
39884
  const errors = [];
39911
- const absolutePath = path23.resolve(filePath);
39885
+ const absolutePath = path33.resolve(filePath);
39912
39886
  let parsed;
39913
39887
  try {
39914
39888
  const content = await readFile33(absolutePath, "utf8");
@@ -40187,8 +40161,8 @@ async function validateConfigFile(filePath) {
40187
40161
  }
40188
40162
  const config2 = parsed;
40189
40163
  const schema = config2.$schema;
40190
- if (schema !== SCHEMA_CONFIG_V222) {
40191
- const message = typeof schema === "string" ? `Invalid $schema value '${schema}'. Expected '${SCHEMA_CONFIG_V222}'` : `Missing required field '$schema'. Please add '$schema: ${SCHEMA_CONFIG_V222}' at the top of the file.`;
40164
+ if (schema !== void 0 && schema !== SCHEMA_CONFIG_V222) {
40165
+ const message = `Invalid $schema value '${schema}'. Expected '${SCHEMA_CONFIG_V222}' or omit the field.`;
40192
40166
  errors.push({
40193
40167
  severity: "error",
40194
40168
  filePath,
@@ -40250,7 +40224,7 @@ function isObject3(value) {
40250
40224
  }
40251
40225
  async function validateFileReferences(evalFilePath) {
40252
40226
  const errors = [];
40253
- const absolutePath = path33.resolve(evalFilePath);
40227
+ const absolutePath = path43.resolve(evalFilePath);
40254
40228
  const gitRoot = await findGitRoot(absolutePath);
40255
40229
  if (!gitRoot) {
40256
40230
  errors.push({
@@ -40607,12 +40581,12 @@ function buildDefaultOutputPath(cwd, format) {
40607
40581
  const extension = getDefaultExtension(format);
40608
40582
  return path18.join(cwd, ".agentv", "results", `${baseName}_${timestamp}${extension}`);
40609
40583
  }
40610
- function resolvePromptDirectory(option, cwd) {
40611
- if (option === void 0) {
40584
+ function resolvePromptDirectory(option4, cwd) {
40585
+ if (option4 === void 0) {
40612
40586
  return void 0;
40613
40587
  }
40614
- if (typeof option === "string" && option.trim().length > 0) {
40615
- return path18.resolve(cwd, option);
40588
+ if (typeof option4 === "string" && option4.trim().length > 0) {
40589
+ return path18.resolve(cwd, option4);
40616
40590
  }
40617
40591
  return path18.join(cwd, ".agentv", "prompts");
40618
40592
  }
@@ -40916,56 +40890,119 @@ async function resolveEvaluationRunner() {
40916
40890
  }
40917
40891
 
40918
40892
  // src/commands/eval/index.ts
40919
- function parseInteger(value, fallback) {
40920
- const parsed = Number.parseInt(value, 10);
40921
- if (Number.isNaN(parsed)) {
40922
- return fallback;
40923
- }
40924
- return parsed;
40925
- }
40926
- function registerEvalCommand(program) {
40927
- program.command("eval").description("Run eval suites and report results").argument("<eval-paths...>", "Path(s) or glob(s) to evaluation .yaml file(s)").option("--target <name>", "Override target name from targets.yaml", "default").option("--targets <path>", "Path to targets.yaml (overrides discovery)").option("--eval-id <id>", "Run only the eval case with this identifier").option(
40928
- "--workers <count>",
40929
- "Number of parallel workers (default: 1, max: 50). Can also be set per-target in targets.yaml",
40930
- (value) => parseInteger(value, 1)
40931
- ).option("--out <path>", "Write results to the specified path").option(
40932
- "--output-format <format>",
40933
- "Output format: 'jsonl' or 'yaml' (default: jsonl)",
40934
- "jsonl"
40935
- ).option("--dry-run", "Use mock provider responses instead of real LLM calls", false).option(
40936
- "--dry-run-delay <ms>",
40937
- "Fixed delay in milliseconds for dry-run mode (overridden by delay range if specified)",
40938
- (value) => parseInteger(value, 0),
40939
- 0
40940
- ).option(
40941
- "--dry-run-delay-min <ms>",
40942
- "Minimum delay in milliseconds for dry-run mode (requires --dry-run-delay-max)",
40943
- (value) => parseInteger(value, 0),
40944
- 0
40945
- ).option(
40946
- "--dry-run-delay-max <ms>",
40947
- "Maximum delay in milliseconds for dry-run mode (requires --dry-run-delay-min)",
40948
- (value) => parseInteger(value, 0),
40949
- 0
40950
- ).option(
40951
- "--agent-timeout <seconds>",
40952
- "Timeout in seconds for provider responses (default: 120)",
40953
- (value) => parseInteger(value, 120),
40954
- 120
40955
- ).option(
40956
- "--max-retries <count>",
40957
- "Retry count for timeout recoveries (default: 2)",
40958
- (value) => parseInteger(value, 2),
40959
- 2
40960
- ).option("--cache", "Enable in-memory provider response cache", false).option("--verbose", "Enable verbose logging", false).option(
40961
- "--dump-prompts [dir]",
40962
- "Persist prompt payloads for debugging (optional custom directory)"
40963
- ).action(async (evalPaths, rawOptions) => {
40964
- const resolvedPaths = await resolveEvalPaths(evalPaths, process.cwd());
40893
+ var evalCommand = command({
40894
+ name: "eval",
40895
+ description: "Run eval suites and report results",
40896
+ args: {
40897
+ evalPaths: restPositionals({
40898
+ type: string4,
40899
+ displayName: "eval-paths",
40900
+ description: "Path(s) or glob(s) to evaluation .yaml file(s)"
40901
+ }),
40902
+ target: option({
40903
+ type: string4,
40904
+ long: "target",
40905
+ description: "Override target name from targets.yaml",
40906
+ defaultValue: () => "default"
40907
+ }),
40908
+ targets: option({
40909
+ type: optional2(string4),
40910
+ long: "targets",
40911
+ description: "Path to targets.yaml (overrides discovery)"
40912
+ }),
40913
+ evalId: option({
40914
+ type: optional2(string4),
40915
+ long: "eval-id",
40916
+ description: "Run only the eval case with this identifier"
40917
+ }),
40918
+ workers: option({
40919
+ type: number4,
40920
+ long: "workers",
40921
+ description: "Number of parallel workers (default: 1, max: 50). Can also be set per-target in targets.yaml",
40922
+ defaultValue: () => 1
40923
+ }),
40924
+ out: option({
40925
+ type: optional2(string4),
40926
+ long: "out",
40927
+ description: "Write results to the specified path"
40928
+ }),
40929
+ outputFormat: option({
40930
+ type: string4,
40931
+ long: "output-format",
40932
+ description: "Output format: 'jsonl' or 'yaml' (default: jsonl)",
40933
+ defaultValue: () => "jsonl"
40934
+ }),
40935
+ dryRun: flag({
40936
+ long: "dry-run",
40937
+ description: "Use mock provider responses instead of real LLM calls"
40938
+ }),
40939
+ dryRunDelay: option({
40940
+ type: number4,
40941
+ long: "dry-run-delay",
40942
+ description: "Fixed delay in milliseconds for dry-run mode (overridden by delay range if specified)",
40943
+ defaultValue: () => 0
40944
+ }),
40945
+ dryRunDelayMin: option({
40946
+ type: number4,
40947
+ long: "dry-run-delay-min",
40948
+ description: "Minimum delay in milliseconds for dry-run mode (requires --dry-run-delay-max)",
40949
+ defaultValue: () => 0
40950
+ }),
40951
+ dryRunDelayMax: option({
40952
+ type: number4,
40953
+ long: "dry-run-delay-max",
40954
+ description: "Maximum delay in milliseconds for dry-run mode (requires --dry-run-delay-min)",
40955
+ defaultValue: () => 0
40956
+ }),
40957
+ agentTimeout: option({
40958
+ type: number4,
40959
+ long: "agent-timeout",
40960
+ description: "Timeout in seconds for provider responses (default: 120)",
40961
+ defaultValue: () => 120
40962
+ }),
40963
+ maxRetries: option({
40964
+ type: number4,
40965
+ long: "max-retries",
40966
+ description: "Retry count for timeout recoveries (default: 2)",
40967
+ defaultValue: () => 2
40968
+ }),
40969
+ cache: flag({
40970
+ long: "cache",
40971
+ description: "Enable in-memory provider response cache"
40972
+ }),
40973
+ verbose: flag({
40974
+ long: "verbose",
40975
+ description: "Enable verbose logging"
40976
+ }),
40977
+ dumpPrompts: option({
40978
+ type: optional2(string4),
40979
+ long: "dump-prompts",
40980
+ description: "Directory path for persisting prompt payloads for debugging"
40981
+ })
40982
+ },
40983
+ handler: async (args) => {
40984
+ const resolvedPaths = await resolveEvalPaths(args.evalPaths, process.cwd());
40985
+ const dumpPrompts = args.dumpPrompts !== void 0 ? args.dumpPrompts === "." ? true : args.dumpPrompts : void 0;
40986
+ const rawOptions = {
40987
+ target: args.target,
40988
+ targets: args.targets,
40989
+ evalId: args.evalId,
40990
+ workers: args.workers,
40991
+ out: args.out,
40992
+ outputFormat: args.outputFormat,
40993
+ dryRun: args.dryRun,
40994
+ dryRunDelay: args.dryRunDelay,
40995
+ dryRunDelayMin: args.dryRunDelayMin,
40996
+ dryRunDelayMax: args.dryRunDelayMax,
40997
+ agentTimeout: args.agentTimeout,
40998
+ maxRetries: args.maxRetries,
40999
+ cache: args.cache,
41000
+ verbose: args.verbose,
41001
+ dumpPrompts
41002
+ };
40965
41003
  await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
40966
- });
40967
- return program;
40968
- }
41004
+ }
41005
+ });
40969
41006
  async function resolveEvalPaths(evalPaths, cwd) {
40970
41007
  const normalizedInputs = evalPaths.map((value) => value?.trim()).filter((value) => value);
40971
41008
  if (normalizedInputs.length === 0) {
@@ -41013,6 +41050,9 @@ async function resolveEvalPaths(evalPaths, cwd) {
41013
41050
  return sorted;
41014
41051
  }
41015
41052
 
41053
+ // src/commands/generate/index.ts
41054
+ import { command as command2, flag as flag2, option as option2, optional as optional3, positional as positional2, string as string5, subcommands } from "cmd-ts";
41055
+
41016
41056
  // src/commands/generate/rubrics.ts
41017
41057
  import { readFile as readFile8, writeFile as writeFile6 } from "node:fs/promises";
41018
41058
  import path20 from "node:path";
@@ -41154,29 +41194,53 @@ function extractQuestion(evalCase) {
41154
41194
  }
41155
41195
 
41156
41196
  // src/commands/generate/index.ts
41157
- function registerGenerateCommand(program) {
41158
- const generate = program.command("generate").description("Generate evaluation artifacts");
41159
- generate.command("rubrics <file>").description("Generate rubrics from expected_outcome in YAML eval file").option(
41160
- "-t, --target <target>",
41161
- "Override target for rubric generation (default: file target or openai:gpt-4o)"
41162
- ).option("-v, --verbose", "Show detailed progress").action(async (file2, options) => {
41197
+ var rubricsCommand = command2({
41198
+ name: "rubrics",
41199
+ description: "Generate rubrics from expected_outcome in YAML eval file",
41200
+ args: {
41201
+ file: positional2({
41202
+ type: string5,
41203
+ displayName: "file",
41204
+ description: "Path to YAML eval file"
41205
+ }),
41206
+ target: option2({
41207
+ type: optional3(string5),
41208
+ long: "target",
41209
+ short: "t",
41210
+ description: "Override target for rubric generation (default: file target or openai:gpt-4o)"
41211
+ }),
41212
+ verbose: flag2({
41213
+ long: "verbose",
41214
+ short: "v",
41215
+ description: "Show detailed progress"
41216
+ })
41217
+ },
41218
+ handler: async ({ file: file2, target, verbose }) => {
41163
41219
  try {
41164
41220
  await generateRubricsCommand({
41165
41221
  file: file2,
41166
- target: options.target,
41167
- verbose: options.verbose
41222
+ target,
41223
+ verbose
41168
41224
  });
41169
41225
  } catch (error40) {
41170
41226
  console.error(`Error: ${error40.message}`);
41171
41227
  process.exit(1);
41172
41228
  }
41173
- });
41174
- }
41229
+ }
41230
+ });
41231
+ var generateCommand = subcommands({
41232
+ name: "generate",
41233
+ description: "Generate evaluation artifacts",
41234
+ cmds: {
41235
+ rubrics: rubricsCommand
41236
+ }
41237
+ });
41175
41238
 
41176
41239
  // src/commands/init/index.ts
41177
41240
  import { existsSync, mkdirSync, writeFileSync } from "node:fs";
41178
41241
  import path24 from "node:path";
41179
41242
  import * as readline from "node:readline/promises";
41243
+ import { command as command3, option as option3, optional as optional4, string as string6 } from "cmd-ts";
41180
41244
 
41181
41245
  // src/templates/index.ts
41182
41246
  import { readFileSync, readdirSync, statSync } from "node:fs";
@@ -41355,15 +41419,28 @@ Files installed to ${path24.relative(targetPath, claudeDir)}:`);
41355
41419
  console.log(" 2. Configure targets in .agentv/targets.yaml");
41356
41420
  console.log(" 3. Create eval files using the schema and prompt templates");
41357
41421
  }
41422
+ var initCmdTsCommand = command3({
41423
+ name: "init",
41424
+ description: "Initialize AgentV in your project (installs prompt templates and schema to .github)",
41425
+ args: {
41426
+ path: option3({
41427
+ type: optional4(string6),
41428
+ long: "path",
41429
+ description: "Target directory for initialization (default: current directory)"
41430
+ })
41431
+ },
41432
+ handler: async ({ path: targetPath }) => {
41433
+ try {
41434
+ await initCommand({ targetPath });
41435
+ } catch (error40) {
41436
+ console.error(`Error: ${error40.message}`);
41437
+ process.exit(1);
41438
+ }
41439
+ }
41440
+ });
41358
41441
 
41359
- // src/commands/status.ts
41360
- function registerStatusCommand(program) {
41361
- program.command("status").description("Show the latest AgentV kernel status").action(() => {
41362
- const kernel = createAgentKernel();
41363
- console.log(`Kernel status: ${kernel.status}`);
41364
- });
41365
- return program;
41366
- }
41442
+ // src/commands/validate/index.ts
41443
+ import { command as command4, restPositionals as restPositionals2, string as string7 } from "cmd-ts";
41367
41444
 
41368
41445
  // src/commands/validate/format-output.ts
41369
41446
  var ANSI_RED3 = "\x1B[31m";
@@ -41468,20 +41545,6 @@ async function validateFiles(paths) {
41468
41545
  async function validateSingleFile(filePath) {
41469
41546
  const absolutePath = path25.resolve(filePath);
41470
41547
  const fileType = await detectFileType(absolutePath);
41471
- if (fileType === "unknown") {
41472
- return {
41473
- valid: false,
41474
- filePath: absolutePath,
41475
- fileType: "unknown",
41476
- errors: [
41477
- {
41478
- severity: "error",
41479
- filePath: absolutePath,
41480
- message: "Missing or invalid $schema field. File must declare schema: 'agentv-eval-v2', 'agentv-targets-v2', or 'agentv-config-v2'"
41481
- }
41482
- ]
41483
- };
41484
- }
41485
41548
  let result;
41486
41549
  if (fileType === "eval") {
41487
41550
  result = await validateEvalFile(absolutePath);
@@ -41551,7 +41614,7 @@ function isYamlFile(filePath) {
41551
41614
  }
41552
41615
 
41553
41616
  // src/commands/validate/index.ts
41554
- async function runValidateCommand(paths, _options) {
41617
+ async function runValidateCommand(paths) {
41555
41618
  if (paths.length === 0) {
41556
41619
  console.error("Error: No paths specified. Usage: agentv validate <paths...>");
41557
41620
  process.exit(1);
@@ -41563,47 +41626,45 @@ async function runValidateCommand(paths, _options) {
41563
41626
  process.exit(1);
41564
41627
  }
41565
41628
  }
41566
- function registerValidateCommand(program) {
41567
- program.command("validate").description("Validate AgentV eval and targets YAML files").argument("<paths...>", "Files or directories to validate").action(async (paths, _options) => {
41629
+ var validateCommand = command4({
41630
+ name: "validate",
41631
+ description: "Validate AgentV eval and targets YAML files",
41632
+ args: {
41633
+ paths: restPositionals2({
41634
+ type: string7,
41635
+ displayName: "paths",
41636
+ description: "Files or directories to validate"
41637
+ })
41638
+ },
41639
+ handler: async ({ paths }) => {
41568
41640
  try {
41569
- await runValidateCommand(paths, _options);
41641
+ await runValidateCommand(paths);
41570
41642
  } catch (error40) {
41571
41643
  console.error(`Error: ${error40.message}`);
41572
41644
  process.exit(1);
41573
41645
  }
41574
- });
41575
- return program;
41576
- }
41646
+ }
41647
+ });
41577
41648
 
41578
41649
  // src/index.ts
41579
41650
  var packageJson = JSON.parse(readFileSync2(new URL("../package.json", import.meta.url), "utf8"));
41580
- function createProgram() {
41581
- const program = new Command();
41582
- program.name("agentv").description("AgentV CLI scaffolding").version(packageJson.version);
41583
- registerStatusCommand(program);
41584
- registerEvalCommand(program);
41585
- registerValidateCommand(program);
41586
- registerGenerateCommand(program);
41587
- program.command("init [path]").description(
41588
- "Initialize AgentV in your project (installs prompt templates and schema to .github)"
41589
- ).action(async (targetPath) => {
41590
- try {
41591
- await initCommand({ targetPath });
41592
- } catch (error40) {
41593
- console.error(`Error: ${error40.message}`);
41594
- process.exit(1);
41595
- }
41596
- });
41597
- return program;
41598
- }
41651
+ var app = subcommands2({
41652
+ name: "agentv",
41653
+ description: "AgentV CLI",
41654
+ version: packageJson.version,
41655
+ cmds: {
41656
+ eval: evalCommand,
41657
+ validate: validateCommand,
41658
+ generate: generateCommand,
41659
+ init: initCmdTsCommand
41660
+ }
41661
+ });
41599
41662
  async function runCli(argv = process.argv) {
41600
- const program = createProgram();
41601
- await program.parseAsync(argv);
41602
- return program;
41663
+ await run(binary(app), argv);
41603
41664
  }
41604
41665
 
41605
41666
  export {
41606
- createProgram,
41667
+ app,
41607
41668
  runCli
41608
41669
  };
41609
- //# sourceMappingURL=chunk-MA3MJNJH.js.map
41670
+ //# sourceMappingURL=chunk-A5T7W63L.js.map