agentv 2.2.0 → 2.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -375,7 +375,7 @@ var compareCommand = command({
375
375
  import { readFileSync as readFileSync2, writeFileSync } from "node:fs";
376
376
  import path16 from "node:path";
377
377
 
378
- // ../../packages/core/dist/chunk-KDEP4I7G.js
378
+ // ../../packages/core/dist/chunk-RP3M7COZ.js
379
379
  import { constants } from "node:fs";
380
380
  import { access, readFile } from "node:fs/promises";
381
381
  import path from "node:path";
@@ -4422,7 +4422,7 @@ var coerce = {
4422
4422
  };
4423
4423
  var NEVER = INVALID;
4424
4424
 
4425
- // ../../packages/core/dist/chunk-KDEP4I7G.js
4425
+ // ../../packages/core/dist/chunk-RP3M7COZ.js
4426
4426
  async function fileExists(filePath) {
4427
4427
  try {
4428
4428
  await access(filePath, constants.F_OK);
@@ -5420,6 +5420,7 @@ function isAgentProvider(provider) {
5420
5420
  // ../../packages/core/dist/index.js
5421
5421
  import { readFile as readFile6 } from "node:fs/promises";
5422
5422
  import path72 from "node:path";
5423
+ import micromatch3 from "micromatch";
5423
5424
  import { parse as parse22 } from "yaml";
5424
5425
  import { readFile as readFile4 } from "node:fs/promises";
5425
5426
  import path22 from "node:path";
@@ -5432,6 +5433,7 @@ import path32 from "node:path";
5432
5433
  import { readFile as readFile22 } from "node:fs/promises";
5433
5434
  import { readFile as readFile42 } from "node:fs/promises";
5434
5435
  import path52 from "node:path";
5436
+ import micromatch2 from "micromatch";
5435
5437
  import { parse as parseYaml } from "yaml";
5436
5438
  import { readFile as readFile32 } from "node:fs/promises";
5437
5439
  import path42 from "node:path";
@@ -35077,6 +35079,7 @@ import { randomBytes } from "node:crypto";
35077
35079
  import { createServer } from "node:http";
35078
35080
  import { createHash } from "node:crypto";
35079
35081
  import path15 from "node:path";
35082
+ import micromatch4 from "micromatch";
35080
35083
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
35081
35084
  var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
35082
35085
  function isTestMessageRole(value) {
@@ -35452,11 +35455,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
35452
35455
  );
35453
35456
  }
35454
35457
  }
35455
- const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
35456
- const config2 = {};
35458
+ const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
35459
+ const config22 = {};
35457
35460
  for (const [key2, value] of Object.entries(rawEvaluator)) {
35458
- if (!knownProps.has(key2) && value !== void 0) {
35459
- config2[key2] = value;
35461
+ if (!knownProps2.has(key2) && value !== void 0) {
35462
+ config22[key2] = value;
35460
35463
  }
35461
35464
  }
35462
35465
  evaluators.push({
@@ -35466,7 +35469,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
35466
35469
  cwd,
35467
35470
  resolvedCwd,
35468
35471
  ...weight2 !== void 0 ? { weight: weight2 } : {},
35469
- ...Object.keys(config2).length > 0 ? { config: config2 } : {},
35472
+ ...Object.keys(config22).length > 0 ? { config: config22 } : {},
35470
35473
  ...targetConfig !== void 0 ? { target: targetConfig } : {}
35471
35474
  });
35472
35475
  continue;
@@ -35631,7 +35634,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
35631
35634
  continue;
35632
35635
  }
35633
35636
  const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
35634
- const config2 = {
35637
+ const config22 = {
35635
35638
  name: name16,
35636
35639
  type: "tool_trajectory",
35637
35640
  mode,
@@ -35639,7 +35642,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
35639
35642
  ...expected ? { expected } : {},
35640
35643
  ...weight2 !== void 0 ? { weight: weight2 } : {}
35641
35644
  };
35642
- evaluators.push(config2);
35645
+ evaluators.push(config22);
35643
35646
  continue;
35644
35647
  }
35645
35648
  if (typeValue === "field_accuracy") {
@@ -35776,9 +35779,33 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
35776
35779
  });
35777
35780
  continue;
35778
35781
  }
35779
- const prompt = asString(rawEvaluator.prompt);
35782
+ const rawPrompt = rawEvaluator.prompt;
35783
+ let prompt;
35780
35784
  let promptPath;
35781
- if (prompt) {
35785
+ let resolvedPromptScript;
35786
+ let promptScriptConfig;
35787
+ if (isJsonObject2(rawPrompt)) {
35788
+ const scriptArray = asStringArray(
35789
+ rawPrompt.script,
35790
+ `prompt.script for evaluator '${name16}' in '${evalId}'`
35791
+ );
35792
+ if (!scriptArray) {
35793
+ throw new Error(`Evaluator '${name16}' in '${evalId}': prompt object requires script array`);
35794
+ }
35795
+ const scriptPath = scriptArray[scriptArray.length - 1];
35796
+ const resolved = await resolveFileReference2(scriptPath, searchRoots);
35797
+ if (resolved.resolvedPath) {
35798
+ resolvedPromptScript = [...scriptArray.slice(0, -1), path32.resolve(resolved.resolvedPath)];
35799
+ } else {
35800
+ throw new Error(
35801
+ `Evaluator '${name16}' in '${evalId}': prompt script file not found: ${resolved.displayPath}`
35802
+ );
35803
+ }
35804
+ if (isJsonObject2(rawPrompt.config)) {
35805
+ promptScriptConfig = rawPrompt.config;
35806
+ }
35807
+ } else if (typeof rawPrompt === "string") {
35808
+ prompt = rawPrompt;
35782
35809
  const resolved = await resolveFileReference2(prompt, searchRoots);
35783
35810
  if (resolved.resolvedPath) {
35784
35811
  promptPath = path32.resolve(resolved.resolvedPath);
@@ -35797,12 +35824,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
35797
35824
  }
35798
35825
  const _model = asString(rawEvaluator.model);
35799
35826
  const rawRubrics = rawEvaluator.rubrics;
35800
- const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
35801
- id: asString(rubric.id) ?? `rubric-${index + 1}`,
35802
- description: asString(rubric.description) ?? "",
35803
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
35804
- required: typeof rubric.required === "boolean" ? rubric.required : true
35805
- })).filter((r) => r.description.length > 0) : void 0;
35827
+ const parsedRubrics = Array.isArray(rawRubrics) ? parseRubricItems(rawRubrics, name16, evalId) : void 0;
35806
35828
  if (typeValue === "rubric") {
35807
35829
  if (!parsedRubrics) {
35808
35830
  logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': missing rubrics array`);
@@ -35822,13 +35844,26 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
35822
35844
  continue;
35823
35845
  }
35824
35846
  const weight = validateWeight(rawEvaluator.weight, name16, evalId);
35847
+ const knownProps = /* @__PURE__ */ new Set(["name", "type", "prompt", "model", "rubrics", "weight", "config"]);
35848
+ const config2 = {};
35849
+ for (const [key2, value] of Object.entries(rawEvaluator)) {
35850
+ if (!knownProps.has(key2) && value !== void 0) {
35851
+ config2[key2] = value;
35852
+ }
35853
+ }
35854
+ const topLevelConfig = isJsonObject2(rawEvaluator.config) ? rawEvaluator.config : {};
35855
+ const mergedConfig = { ...config2, ...topLevelConfig };
35856
+ const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
35825
35857
  evaluators.push({
35826
35858
  name: name16,
35827
35859
  type: "llm_judge",
35828
35860
  prompt,
35829
35861
  promptPath,
35862
+ ...promptPath ? { resolvedPromptPath: promptPath } : {},
35863
+ ...resolvedPromptScript ? { resolvedPromptScript } : {},
35830
35864
  ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
35831
- ...weight !== void 0 ? { weight } : {}
35865
+ ...weight !== void 0 ? { weight } : {},
35866
+ ...finalConfig ? { config: finalConfig } : {}
35832
35867
  });
35833
35868
  }
35834
35869
  return evaluators.length > 0 ? evaluators : void 0;
@@ -35915,6 +35950,185 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
35915
35950
  function isValidFieldAggregationType(value) {
35916
35951
  return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
35917
35952
  }
35953
+ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
35954
+ const items = [];
35955
+ for (const [index, rawRubric] of rawRubrics.entries()) {
35956
+ if (!isJsonObject2(rawRubric)) {
35957
+ logWarning2(
35958
+ `Skipping invalid rubric entry at index ${index} in evaluator '${evaluatorName}' (expected object)`
35959
+ );
35960
+ continue;
35961
+ }
35962
+ const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
35963
+ const expectedOutcome = asString(rawRubric.expected_outcome) ?? asString(rawRubric.description) ?? "";
35964
+ const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
35965
+ let requiredMinScore;
35966
+ let required2;
35967
+ if (typeof rawRubric.required_min_score === "number") {
35968
+ const minScore = rawRubric.required_min_score;
35969
+ if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
35970
+ throw new Error(
35971
+ `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
35972
+ );
35973
+ }
35974
+ requiredMinScore = minScore;
35975
+ }
35976
+ if (typeof rawRubric.required === "boolean") {
35977
+ required2 = rawRubric.required;
35978
+ }
35979
+ let scoreRanges;
35980
+ const rawScoreRanges = rawRubric.score_ranges;
35981
+ if (rawScoreRanges !== void 0) {
35982
+ if (!Array.isArray(rawScoreRanges)) {
35983
+ throw new Error(
35984
+ `Invalid score_ranges for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an array`
35985
+ );
35986
+ }
35987
+ scoreRanges = parseScoreRanges(rawScoreRanges, id, evaluatorName, evalId);
35988
+ items.push({
35989
+ id,
35990
+ weight,
35991
+ ...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
35992
+ ...required2 !== void 0 ? { required: required2 } : {},
35993
+ ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
35994
+ score_ranges: scoreRanges
35995
+ });
35996
+ } else {
35997
+ if (expectedOutcome.length === 0) {
35998
+ logWarning2(
35999
+ `Skipping rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': missing expected_outcome`
36000
+ );
36001
+ continue;
36002
+ }
36003
+ items.push({
36004
+ id,
36005
+ expected_outcome: expectedOutcome,
36006
+ weight,
36007
+ // Default to required: true if not specified (backward compatibility)
36008
+ required: required2 ?? true,
36009
+ ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
36010
+ });
36011
+ }
36012
+ }
36013
+ return items.length > 0 ? items : void 0;
36014
+ }
36015
+ function parseScoreRanges(rawRanges, rubricId, evaluatorName, evalId) {
36016
+ const ranges = [];
36017
+ for (const [index, rawRange] of rawRanges.entries()) {
36018
+ if (!isJsonObject2(rawRange)) {
36019
+ throw new Error(
36020
+ `Invalid score_range entry at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': expected object`
36021
+ );
36022
+ }
36023
+ const scoreRangeValue = rawRange.score_range;
36024
+ if (!Array.isArray(scoreRangeValue) || scoreRangeValue.length !== 2 || typeof scoreRangeValue[0] !== "number" || typeof scoreRangeValue[1] !== "number") {
36025
+ throw new Error(
36026
+ `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be [min, max] array of two numbers`
36027
+ );
36028
+ }
36029
+ const [min, max] = scoreRangeValue;
36030
+ if (!Number.isInteger(min) || !Number.isInteger(max)) {
36031
+ throw new Error(
36032
+ `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be integers (got [${min}, ${max}])`
36033
+ );
36034
+ }
36035
+ if (min < 0 || min > 10 || max < 0 || max > 10) {
36036
+ throw new Error(
36037
+ `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be 0-10 (got [${min}, ${max}])`
36038
+ );
36039
+ }
36040
+ if (min > max) {
36041
+ throw new Error(
36042
+ `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': min must be <= max (got [${min}, ${max}])`
36043
+ );
36044
+ }
36045
+ const expectedOutcome = asString(rawRange.expected_outcome) ?? asString(rawRange.description) ?? "";
36046
+ if (expectedOutcome.length === 0) {
36047
+ throw new Error(
36048
+ `Missing expected_outcome for score_range [${min}, ${max}] in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}'`
36049
+ );
36050
+ }
36051
+ ranges.push({
36052
+ score_range: [min, max],
36053
+ expected_outcome: expectedOutcome
36054
+ });
36055
+ }
36056
+ const sortedRanges = [...ranges].sort((a, b) => a.score_range[0] - b.score_range[0]);
36057
+ for (let i = 1; i < sortedRanges.length; i++) {
36058
+ const prev = sortedRanges[i - 1];
36059
+ const curr = sortedRanges[i];
36060
+ if (curr.score_range[0] <= prev.score_range[1]) {
36061
+ throw new Error(
36062
+ `Overlapping score_ranges in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': [${prev.score_range[0]}, ${prev.score_range[1]}] overlaps with [${curr.score_range[0]}, ${curr.score_range[1]}]`
36063
+ );
36064
+ }
36065
+ }
36066
+ const covered = /* @__PURE__ */ new Set();
36067
+ for (const range of ranges) {
36068
+ for (let i = range.score_range[0]; i <= range.score_range[1]; i++) {
36069
+ covered.add(i);
36070
+ }
36071
+ }
36072
+ const missing = [];
36073
+ for (let i = 0; i <= 10; i++) {
36074
+ if (!covered.has(i)) {
36075
+ missing.push(i);
36076
+ }
36077
+ }
36078
+ if (missing.length > 0) {
36079
+ throw new Error(
36080
+ `Incomplete score_ranges coverage in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': missing coverage for scores: ${missing.join(", ")}. Ranges must cover all integers 0-10.`
36081
+ );
36082
+ }
36083
+ return ranges;
36084
+ }
36085
+ function parseInlineRubrics(rawRubrics) {
36086
+ const rubricItems = rawRubrics.filter((r) => isJsonObject2(r) || typeof r === "string").map((rubric, index) => {
36087
+ if (typeof rubric === "string") {
36088
+ return {
36089
+ id: `rubric-${index + 1}`,
36090
+ expected_outcome: rubric,
36091
+ weight: 1,
36092
+ required: true
36093
+ };
36094
+ }
36095
+ const expectedOutcome = asString(rubric.expected_outcome) ?? asString(rubric.description) ?? "";
36096
+ const rawScoreRanges = rubric.score_ranges;
36097
+ const scoreRanges = Array.isArray(rawScoreRanges) && rawScoreRanges.length > 0 ? rawScoreRanges.filter((r) => isJsonObject2(r)).map((range) => ({
36098
+ score_range: Array.isArray(range.score_range) ? range.score_range : [0, 10],
36099
+ expected_outcome: asString(range.expected_outcome) ?? asString(range.description) ?? ""
36100
+ })).filter((r) => r.expected_outcome.length > 0) : void 0;
36101
+ const baseRubric = {
36102
+ id: asString(rubric.id) ?? `rubric-${index + 1}`,
36103
+ weight: typeof rubric.weight === "number" ? rubric.weight : 1
36104
+ };
36105
+ if (scoreRanges && scoreRanges.length > 0) {
36106
+ return {
36107
+ ...baseRubric,
36108
+ ...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
36109
+ ...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
36110
+ ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
36111
+ score_ranges: scoreRanges
36112
+ };
36113
+ }
36114
+ return {
36115
+ ...baseRubric,
36116
+ expected_outcome: expectedOutcome,
36117
+ required: typeof rubric.required === "boolean" ? rubric.required : true,
36118
+ ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
36119
+ };
36120
+ }).filter(
36121
+ (r) => r.expected_outcome && r.expected_outcome.length > 0 || "score_ranges" in r && r.score_ranges
36122
+ );
36123
+ if (rubricItems.length === 0) {
36124
+ return void 0;
36125
+ }
36126
+ return {
36127
+ name: "rubric",
36128
+ type: "llm_judge",
36129
+ rubrics: rubricItems
36130
+ };
36131
+ }
35918
36132
  function formatFileContents(parts) {
35919
36133
  const fileCount = parts.filter((p) => p.isFile).length;
35920
36134
  if (fileCount > 0) {
@@ -36167,6 +36381,63 @@ async function processExpectedMessages(options) {
36167
36381
  }
36168
36382
  return segments;
36169
36383
  }
36384
+ function expandInputShorthand(value) {
36385
+ if (value === void 0 || value === null) {
36386
+ return void 0;
36387
+ }
36388
+ if (typeof value === "string") {
36389
+ return [{ role: "user", content: value }];
36390
+ }
36391
+ if (Array.isArray(value)) {
36392
+ const messages = value.filter((msg) => isTestMessage(msg));
36393
+ return messages.length > 0 ? messages : void 0;
36394
+ }
36395
+ return void 0;
36396
+ }
36397
+ function expandExpectedOutputShorthand(value) {
36398
+ if (value === void 0 || value === null) {
36399
+ return void 0;
36400
+ }
36401
+ if (typeof value === "string") {
36402
+ return [{ role: "assistant", content: value }];
36403
+ }
36404
+ if (Array.isArray(value)) {
36405
+ if (value.length > 0 && isJsonObject(value[0]) && "role" in value[0]) {
36406
+ const messages = value.filter((msg) => isTestMessage(msg));
36407
+ return messages.length > 0 ? messages : void 0;
36408
+ }
36409
+ return [{ role: "assistant", content: value }];
36410
+ }
36411
+ if (isJsonObject(value)) {
36412
+ if ("role" in value) {
36413
+ return isTestMessage(value) ? [value] : void 0;
36414
+ }
36415
+ return [{ role: "assistant", content: value }];
36416
+ }
36417
+ return void 0;
36418
+ }
36419
+ function resolveInputMessages(raw) {
36420
+ if (raw.input_messages !== void 0) {
36421
+ if (Array.isArray(raw.input_messages)) {
36422
+ const messages = raw.input_messages.filter((msg) => isTestMessage(msg));
36423
+ return messages.length > 0 ? messages : void 0;
36424
+ }
36425
+ return void 0;
36426
+ }
36427
+ return expandInputShorthand(raw.input);
36428
+ }
36429
+ function resolveExpectedMessages(raw) {
36430
+ if (raw.expected_messages !== void 0) {
36431
+ if (Array.isArray(raw.expected_messages)) {
36432
+ const messages = raw.expected_messages.filter(
36433
+ (msg) => isTestMessage(msg)
36434
+ );
36435
+ return messages.length > 0 ? messages : void 0;
36436
+ }
36437
+ return void 0;
36438
+ }
36439
+ return expandExpectedOutputShorthand(raw.expected_output);
36440
+ }
36170
36441
  var ANSI_YELLOW5 = "\x1B[33m";
36171
36442
  var ANSI_RED = "\x1B[31m";
36172
36443
  var ANSI_RESET5 = "\x1B[0m";
@@ -36226,7 +36497,7 @@ function parseJsonlContent(content, filePath) {
36226
36497
  }
36227
36498
  async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
36228
36499
  const verbose = options?.verbose ?? false;
36229
- const evalIdFilter = options?.evalId;
36500
+ const filterPattern = options?.filter;
36230
36501
  const absoluteTestPath = path52.resolve(evalFilePath);
36231
36502
  const repoRootPath = resolveToAbsolutePath(repoRoot);
36232
36503
  const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
@@ -36253,28 +36524,20 @@ async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
36253
36524
  const evalcase = rawCases[lineIndex];
36254
36525
  const lineNumber = lineIndex + 1;
36255
36526
  const id = asString4(evalcase.id);
36256
- if (evalIdFilter && id !== evalIdFilter) {
36527
+ if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
36257
36528
  continue;
36258
36529
  }
36259
36530
  const conversationId = asString4(evalcase.conversation_id);
36260
36531
  const outcome = asString4(evalcase.expected_outcome) ?? asString4(evalcase.outcome);
36261
- const inputMessagesValue = evalcase.input_messages;
36262
- const expectedMessagesValue = evalcase.expected_messages;
36263
- if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
36532
+ const inputMessages = resolveInputMessages(evalcase);
36533
+ const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
36534
+ if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
36264
36535
  logError(
36265
- `Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages`
36536
+ `Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages (or input)`
36266
36537
  );
36267
36538
  continue;
36268
36539
  }
36269
- const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
36270
- const inputMessages = inputMessagesValue.filter(
36271
- (msg) => isTestMessage(msg)
36272
- );
36273
- const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
36274
- if (hasExpectedMessages && expectedMessages.length === 0) {
36275
- logError(`Line ${lineNumber}: No valid expected message found for eval case: ${id}`);
36276
- continue;
36277
- }
36540
+ const hasExpectedMessages = expectedMessages.length > 0;
36278
36541
  const guidelinePaths = [];
36279
36542
  const inputTextParts = [];
36280
36543
  const inputSegments = await processMessages({
@@ -36320,28 +36583,8 @@ async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
36320
36583
  }
36321
36584
  const inlineRubrics = evalcase.rubrics;
36322
36585
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
36323
- const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
36324
- if (typeof rubric === "string") {
36325
- return {
36326
- id: `rubric-${index + 1}`,
36327
- description: rubric,
36328
- weight: 1,
36329
- required: true
36330
- };
36331
- }
36332
- return {
36333
- id: asString4(rubric.id) ?? `rubric-${index + 1}`,
36334
- description: asString4(rubric.description) ?? "",
36335
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
36336
- required: typeof rubric.required === "boolean" ? rubric.required : true
36337
- };
36338
- }).filter((r) => r.description.length > 0);
36339
- if (rubricItems.length > 0) {
36340
- const rubricEvaluator = {
36341
- name: "rubric",
36342
- type: "llm_judge",
36343
- rubrics: rubricItems
36344
- };
36586
+ const rubricEvaluator = parseInlineRubrics(inlineRubrics);
36587
+ if (rubricEvaluator) {
36345
36588
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
36346
36589
  }
36347
36590
  }
@@ -36645,7 +36888,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
36645
36888
  return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
36646
36889
  }
36647
36890
  const verbose = options?.verbose ?? false;
36648
- const evalIdFilter = options?.evalId;
36891
+ const filterPattern = options?.filter;
36649
36892
  const absoluteTestPath = path72.resolve(evalFilePath);
36650
36893
  const repoRootPath = resolveToAbsolutePath(repoRoot);
36651
36894
  const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
@@ -36675,28 +36918,20 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
36675
36918
  }
36676
36919
  const evalcase = rawEvalcase;
36677
36920
  const id = asString6(evalcase.id);
36678
- if (evalIdFilter && id !== evalIdFilter) {
36921
+ if (filterPattern && (!id || !micromatch3.isMatch(id, filterPattern))) {
36679
36922
  continue;
36680
36923
  }
36681
36924
  const conversationId = asString6(evalcase.conversation_id);
36682
36925
  const outcome = asString6(evalcase.expected_outcome) ?? asString6(evalcase.outcome);
36683
- const inputMessagesValue = evalcase.input_messages;
36684
- const expectedMessagesValue = evalcase.expected_messages;
36685
- if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
36926
+ const inputMessages = resolveInputMessages(evalcase);
36927
+ const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
36928
+ if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
36686
36929
  logError2(
36687
- `Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
36930
+ `Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages (or input)`
36688
36931
  );
36689
36932
  continue;
36690
36933
  }
36691
- const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
36692
- const inputMessages = inputMessagesValue.filter(
36693
- (msg) => isTestMessage(msg)
36694
- );
36695
- const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
36696
- if (hasExpectedMessages && expectedMessages.length === 0) {
36697
- logError2(`No valid expected message found for eval case: ${id}`);
36698
- continue;
36699
- }
36934
+ const hasExpectedMessages = expectedMessages.length > 0;
36700
36935
  const guidelinePaths = [];
36701
36936
  const inputTextParts = [];
36702
36937
  const inputSegments = await processMessages({
@@ -36740,28 +36975,8 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
36740
36975
  }
36741
36976
  const inlineRubrics = evalcase.rubrics;
36742
36977
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
36743
- const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
36744
- if (typeof rubric === "string") {
36745
- return {
36746
- id: `rubric-${index + 1}`,
36747
- description: rubric,
36748
- weight: 1,
36749
- required: true
36750
- };
36751
- }
36752
- return {
36753
- id: asString6(rubric.id) ?? `rubric-${index + 1}`,
36754
- description: asString6(rubric.description) ?? "",
36755
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
36756
- required: typeof rubric.required === "boolean" ? rubric.required : true
36757
- };
36758
- }).filter((r) => r.description.length > 0);
36759
- if (rubricItems.length > 0) {
36760
- const rubricEvaluator = {
36761
- name: "rubric",
36762
- type: "llm_judge",
36763
- rubrics: rubricItems
36764
- };
36978
+ const rubricEvaluator = parseInlineRubrics(inlineRubrics);
36979
+ if (rubricEvaluator) {
36765
36980
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
36766
36981
  }
36767
36982
  }
@@ -37975,7 +38190,8 @@ var ToolCallSchema = external_exports.object({
37975
38190
  input: external_exports.unknown().optional(),
37976
38191
  output: external_exports.unknown().optional(),
37977
38192
  id: external_exports.string().optional(),
37978
- timestamp: external_exports.string().optional()
38193
+ timestamp: external_exports.string().optional(),
38194
+ duration_ms: external_exports.number().optional()
37979
38195
  });
37980
38196
  var OutputMessageInputSchema = external_exports.object({
37981
38197
  role: external_exports.string(),
@@ -37983,6 +38199,7 @@ var OutputMessageInputSchema = external_exports.object({
37983
38199
  content: external_exports.unknown().optional(),
37984
38200
  tool_calls: external_exports.array(ToolCallSchema).optional(),
37985
38201
  timestamp: external_exports.string().optional(),
38202
+ duration_ms: external_exports.number().optional(),
37986
38203
  metadata: external_exports.record(external_exports.unknown()).optional()
37987
38204
  });
37988
38205
  var TokenUsageSchema = external_exports.object({
@@ -38021,8 +38238,16 @@ function convertOutputMessages(messages) {
38021
38238
  role: msg.role,
38022
38239
  name: msg.name,
38023
38240
  content: msg.content,
38024
- toolCalls: msg.tool_calls,
38241
+ toolCalls: msg.tool_calls?.map((tc) => ({
38242
+ tool: tc.tool,
38243
+ input: tc.input,
38244
+ output: tc.output,
38245
+ id: tc.id,
38246
+ timestamp: tc.timestamp,
38247
+ durationMs: tc.duration_ms
38248
+ })),
38025
38249
  timestamp: msg.timestamp,
38250
+ durationMs: msg.duration_ms,
38026
38251
  metadata: msg.metadata
38027
38252
  }));
38028
38253
  }
@@ -41012,6 +41237,15 @@ var rubricEvaluationSchema = external_exports.object({
41012
41237
  checks: external_exports.array(rubricCheckResultSchema).describe("Results for each rubric item"),
41013
41238
  overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)")
41014
41239
  });
41240
+ var scoreRangeCheckResultSchema = external_exports.object({
41241
+ id: external_exports.string().describe("The ID of the rubric criterion being scored"),
41242
+ score: external_exports.number().int().min(0).max(10).describe("Integer score 0-10 for this criterion"),
41243
+ reasoning: external_exports.string().describe("Brief explanation (1-2 sentences) for this score").optional()
41244
+ });
41245
+ var scoreRangeEvaluationSchema = external_exports.object({
41246
+ checks: external_exports.array(scoreRangeCheckResultSchema).describe("Scores for each rubric criterion"),
41247
+ overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)").optional()
41248
+ });
41015
41249
  var LlmJudgeEvaluator = class {
41016
41250
  kind = "llm_judge";
41017
41251
  resolveJudgeProvider;
@@ -41097,6 +41331,10 @@ var LlmJudgeEvaluator = class {
41097
41331
  `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
41098
41332
  );
41099
41333
  }
41334
+ const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
41335
+ if (hasScoreRanges) {
41336
+ return this.evaluateWithScoreRanges(context, judgeProvider, rubrics);
41337
+ }
41100
41338
  const prompt = this.buildRubricPrompt(context, rubrics);
41101
41339
  const systemPrompt = buildRubricOutputSchema();
41102
41340
  const evaluatorRawRequest = {
@@ -41122,6 +41360,84 @@ var LlmJudgeEvaluator = class {
41122
41360
  evaluatorRawRequest
41123
41361
  };
41124
41362
  }
41363
+ /**
41364
+ * Evaluate using score-range rubrics (analytic rubric scoring).
41365
+ * Each criterion is scored 0-10 and normalized to 0-1.
41366
+ */
41367
+ async evaluateWithScoreRanges(context, judgeProvider, rubrics) {
41368
+ const prompt = this.buildScoreRangePrompt(context, rubrics);
41369
+ const systemPrompt = buildScoreRangeOutputSchema();
41370
+ const evaluatorRawRequest = {
41371
+ userPrompt: prompt,
41372
+ systemPrompt,
41373
+ target: judgeProvider.targetName
41374
+ };
41375
+ const { data } = await this.runWithRetry({
41376
+ context,
41377
+ judgeProvider,
41378
+ systemPrompt,
41379
+ userPrompt: prompt,
41380
+ schema: scoreRangeEvaluationSchema
41381
+ });
41382
+ const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
41383
+ return {
41384
+ score,
41385
+ verdict,
41386
+ hits,
41387
+ misses,
41388
+ expectedAspectCount: rubrics.length,
41389
+ reasoning: data.overall_reasoning,
41390
+ evaluatorRawRequest,
41391
+ details
41392
+ };
41393
+ }
41394
+ /**
41395
+ * Build prompt for score-range rubric evaluation.
41396
+ */
41397
+ buildScoreRangePrompt(context, rubrics) {
41398
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
41399
+ const parts = [
41400
+ "You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
41401
+ "For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
41402
+ "",
41403
+ "[[ ## question ## ]]",
41404
+ formattedQuestion,
41405
+ "",
41406
+ "[[ ## expected_outcome ## ]]",
41407
+ context.evalCase.expected_outcome,
41408
+ ""
41409
+ ];
41410
+ if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
41411
+ parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
41412
+ }
41413
+ parts.push(
41414
+ "[[ ## candidate_answer ## ]]",
41415
+ context.candidate,
41416
+ "",
41417
+ "[[ ## scoring_criteria ## ]]"
41418
+ );
41419
+ for (const rubric of rubrics) {
41420
+ const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
41421
+ const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
41422
+ parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
41423
+ if (rubric.expected_outcome) {
41424
+ parts.push(`Description: ${rubric.expected_outcome}`);
41425
+ }
41426
+ if (rubric.score_ranges && rubric.score_ranges.length > 0) {
41427
+ parts.push("Score ranges:");
41428
+ for (const range of rubric.score_ranges) {
41429
+ const [min, max] = range.score_range;
41430
+ const rangeLabel = min === max ? `${min}` : `${min}-${max}`;
41431
+ parts.push(` - Score ${rangeLabel}: ${range.expected_outcome}`);
41432
+ }
41433
+ }
41434
+ }
41435
+ parts.push(
41436
+ "",
41437
+ "For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
41438
+ );
41439
+ return parts.join("\n");
41440
+ }
41125
41441
  buildRubricPrompt(context, rubrics) {
41126
41442
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
41127
41443
  const parts = [
@@ -41141,7 +41457,7 @@ var LlmJudgeEvaluator = class {
41141
41457
  for (const rubric of rubrics) {
41142
41458
  const requiredLabel = rubric.required ? " (REQUIRED)" : "";
41143
41459
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
41144
- parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
41460
+ parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.expected_outcome}`);
41145
41461
  }
41146
41462
  parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
41147
41463
  return parts.join("\n");
@@ -41228,9 +41544,9 @@ function calculateRubricScore(result, rubrics) {
41228
41544
  totalWeight += rubric.weight;
41229
41545
  if (check2.satisfied) {
41230
41546
  earnedWeight += rubric.weight;
41231
- hits.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
41547
+ hits.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check2.reasoning}`);
41232
41548
  } else {
41233
- misses.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
41549
+ misses.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check2.reasoning}`);
41234
41550
  if (rubric.required) {
41235
41551
  failedRequired = true;
41236
41552
  }
@@ -41240,6 +41556,76 @@ function calculateRubricScore(result, rubrics) {
41240
41556
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
41241
41557
  return { score, verdict, hits, misses };
41242
41558
  }
41559
+ function buildScoreRangeOutputSchema() {
41560
+ return `You are an expert evaluator. Score the candidate answer on each criterion.
41561
+ You must return a valid JSON object matching this schema:
41562
+ {
41563
+ "checks": [
41564
+ {
41565
+ "id": "string (criterion id)",
41566
+ "score": integer (0-10),
41567
+ "reasoning": "string (brief explanation for score)"
41568
+ }
41569
+ ],
41570
+ "overall_reasoning": "string (summary, optional)"
41571
+ }
41572
+
41573
+ Important: The "score" must be an integer from 0 to 10 that falls within one of the defined score ranges for that criterion.`;
41574
+ }
41575
+ function calculateScoreRangeResult(result, rubrics) {
41576
+ const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
41577
+ const hits = [];
41578
+ const misses = [];
41579
+ const rawScores = {};
41580
+ let totalWeight = 0;
41581
+ let weightedScoreSum = 0;
41582
+ let failedRequired = false;
41583
+ for (const check2 of result.checks) {
41584
+ const rubric = rubricMap.get(check2.id);
41585
+ if (!rubric) {
41586
+ continue;
41587
+ }
41588
+ const rawScore = Math.max(0, Math.min(10, check2.score));
41589
+ const normalizedScore = rawScore / 10;
41590
+ rawScores[rubric.id] = rawScore;
41591
+ totalWeight += rubric.weight;
41592
+ weightedScoreSum += normalizedScore * rubric.weight;
41593
+ let requiredMinScore;
41594
+ if (rubric.required_min_score !== void 0) {
41595
+ requiredMinScore = rubric.required_min_score;
41596
+ } else if (rubric.required === true) {
41597
+ requiredMinScore = 10;
41598
+ }
41599
+ const matchingRange = rubric.score_ranges?.find(
41600
+ (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
41601
+ );
41602
+ const rangeDescription = matchingRange?.expected_outcome ?? "";
41603
+ const criterionLabel = rubric.expected_outcome ?? rubric.id;
41604
+ const reasoningText = check2.reasoning ? `: ${check2.reasoning}` : "";
41605
+ const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
41606
+ if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
41607
+ failedRequired = true;
41608
+ misses.push(scoreInfo);
41609
+ } else if (rawScore >= 7) {
41610
+ hits.push(scoreInfo);
41611
+ } else {
41612
+ misses.push(scoreInfo);
41613
+ }
41614
+ }
41615
+ const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
41616
+ const verdict = failedRequired ? "fail" : scoreToVerdict(score);
41617
+ return {
41618
+ score,
41619
+ verdict,
41620
+ hits,
41621
+ misses,
41622
+ details: {
41623
+ raw_scores: rawScores,
41624
+ normalization: "score / 10",
41625
+ aggregation: "weighted_average"
41626
+ }
41627
+ };
41628
+ }
41243
41629
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
41244
41630
  {{EVALUATOR_RESULTS_JSON}}
41245
41631
 
@@ -42055,6 +42441,27 @@ function argsMatch(expected, actual) {
42055
42441
  }
42056
42442
  return true;
42057
42443
  }
42444
+ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
42445
+ if (maxDurationMs === void 0) {
42446
+ return { status: "skip", message: "" };
42447
+ }
42448
+ if (actualDurationMs === void 0) {
42449
+ return {
42450
+ status: "skip",
42451
+ message: `No duration data for ${toolName}; latency assertion skipped`
42452
+ };
42453
+ }
42454
+ if (actualDurationMs <= maxDurationMs) {
42455
+ return {
42456
+ status: "pass",
42457
+ message: `${toolName} completed in ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
42458
+ };
42459
+ }
42460
+ return {
42461
+ status: "fail",
42462
+ message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
42463
+ };
42464
+ }
42058
42465
  var ToolTrajectoryEvaluator = class {
42059
42466
  kind = "tool_trajectory";
42060
42467
  config;
@@ -42113,7 +42520,8 @@ var ToolTrajectoryEvaluator = class {
42113
42520
  for (const call of message.toolCalls) {
42114
42521
  toolCalls.push({
42115
42522
  name: call.tool,
42116
- args: call.input
42523
+ args: call.input,
42524
+ durationMs: call.durationMs
42117
42525
  });
42118
42526
  }
42119
42527
  }
@@ -42181,17 +42589,27 @@ var ToolTrajectoryEvaluator = class {
42181
42589
  }
42182
42590
  const hits = [];
42183
42591
  const misses = [];
42592
+ const warnings = [];
42184
42593
  let actualIndex = 0;
42594
+ let sequenceHits = 0;
42595
+ let latencyHits = 0;
42596
+ let latencySkips = 0;
42597
+ const latencyAssertionCount = expected.filter(
42598
+ (item) => item.maxDurationMs !== void 0
42599
+ ).length;
42185
42600
  for (let i = 0; i < expected.length; i++) {
42186
42601
  const expectedItem = expected[i];
42187
42602
  const expectedTool = expectedItem.tool;
42188
42603
  let found = false;
42189
42604
  let argsMismatch = false;
42605
+ let matchedCall;
42190
42606
  while (actualIndex < toolCalls.length) {
42191
42607
  const actualCall = toolCalls[actualIndex];
42192
42608
  if (actualCall.name === expectedTool) {
42193
42609
  if (argsMatch(expectedItem.args, actualCall.args)) {
42194
42610
  hits.push(`Found ${expectedTool} at position ${actualIndex}`);
42611
+ sequenceHits++;
42612
+ matchedCall = actualCall;
42195
42613
  actualIndex++;
42196
42614
  found = true;
42197
42615
  break;
@@ -42208,14 +42626,35 @@ var ToolTrajectoryEvaluator = class {
42208
42626
  if (!found && !argsMismatch) {
42209
42627
  misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
42210
42628
  }
42629
+ if (found && matchedCall) {
42630
+ const latencyResult = checkLatency(
42631
+ expectedTool,
42632
+ expectedItem.maxDurationMs,
42633
+ matchedCall.durationMs
42634
+ );
42635
+ if (latencyResult.status === "pass") {
42636
+ hits.push(latencyResult.message);
42637
+ latencyHits++;
42638
+ } else if (latencyResult.status === "fail") {
42639
+ misses.push(latencyResult.message);
42640
+ } else if (latencyResult.message) {
42641
+ warnings.push(latencyResult.message);
42642
+ latencySkips++;
42643
+ }
42644
+ }
42645
+ }
42646
+ for (const warning of warnings) {
42647
+ console.warn(`[tool_trajectory] ${warning}`);
42211
42648
  }
42212
- const score = hits.length / expected.length;
42649
+ const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
42650
+ const totalAssertions = expected.length + effectiveLatencyAssertions;
42651
+ const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
42213
42652
  return {
42214
42653
  score,
42215
42654
  verdict: scoreToVerdict(score),
42216
42655
  hits,
42217
42656
  misses,
42218
- expectedAspectCount: expected.length
42657
+ expectedAspectCount: totalAssertions
42219
42658
  };
42220
42659
  }
42221
42660
  evaluateExact(toolCalls) {
@@ -42231,6 +42670,13 @@ var ToolTrajectoryEvaluator = class {
42231
42670
  }
42232
42671
  const hits = [];
42233
42672
  const misses = [];
42673
+ const warnings = [];
42674
+ let sequenceHits = 0;
42675
+ let latencyHits = 0;
42676
+ let latencySkips = 0;
42677
+ const latencyAssertionCount = expected.filter(
42678
+ (item) => item.maxDurationMs !== void 0
42679
+ ).length;
42234
42680
  if (toolCalls.length !== expected.length) {
42235
42681
  misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
42236
42682
  }
@@ -42240,26 +42686,50 @@ var ToolTrajectoryEvaluator = class {
42240
42686
  const expectedTool = expectedItem.tool;
42241
42687
  const actualCall = toolCalls[i];
42242
42688
  const actualTool = actualCall.name;
42689
+ let sequenceMatched = false;
42243
42690
  if (actualTool === expectedTool) {
42244
42691
  if (argsMatch(expectedItem.args, actualCall.args)) {
42245
42692
  hits.push(`Position ${i}: ${expectedTool}`);
42693
+ sequenceHits++;
42694
+ sequenceMatched = true;
42246
42695
  } else {
42247
42696
  misses.push(`Position ${i}: ${expectedTool} args mismatch`);
42248
42697
  }
42249
42698
  } else {
42250
42699
  misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
42251
42700
  }
42701
+ if (sequenceMatched) {
42702
+ const latencyResult = checkLatency(
42703
+ expectedTool,
42704
+ expectedItem.maxDurationMs,
42705
+ actualCall.durationMs
42706
+ );
42707
+ if (latencyResult.status === "pass") {
42708
+ hits.push(latencyResult.message);
42709
+ latencyHits++;
42710
+ } else if (latencyResult.status === "fail") {
42711
+ misses.push(latencyResult.message);
42712
+ } else if (latencyResult.message) {
42713
+ warnings.push(latencyResult.message);
42714
+ latencySkips++;
42715
+ }
42716
+ }
42252
42717
  }
42253
42718
  for (let i = checkLength; i < expected.length; i++) {
42254
42719
  misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
42255
42720
  }
42256
- const score = hits.length / expected.length;
42721
+ for (const warning of warnings) {
42722
+ console.warn(`[tool_trajectory] ${warning}`);
42723
+ }
42724
+ const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
42725
+ const totalAssertions = expected.length + effectiveLatencyAssertions;
42726
+ const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
42257
42727
  return {
42258
42728
  score,
42259
42729
  verdict: scoreToVerdict(score),
42260
42730
  hits,
42261
42731
  misses,
42262
- expectedAspectCount: expected.length
42732
+ expectedAspectCount: totalAssertions
42263
42733
  };
42264
42734
  }
42265
42735
  };
@@ -42415,17 +42885,17 @@ async function runEvaluation(options) {
42415
42885
  cache,
42416
42886
  useCache,
42417
42887
  now,
42418
- evalId,
42888
+ filter: filter2,
42419
42889
  verbose,
42420
42890
  evalCases: preloadedEvalCases,
42421
42891
  onResult,
42422
42892
  onProgress
42423
42893
  } = options;
42424
- const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, evalId });
42425
- const filteredEvalCases = filterEvalCases(evalCases, evalId);
42894
+ const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, filter: filter2 });
42895
+ const filteredEvalCases = filterEvalCases(evalCases, filter2);
42426
42896
  if (filteredEvalCases.length === 0) {
42427
- if (evalId) {
42428
- throw new Error(`Eval case with id '${evalId}' not found in ${evalFilePath}`);
42897
+ if (filter2) {
42898
+ throw new Error(`No eval cases matched filter '${filter2}' in ${evalFilePath}`);
42429
42899
  }
42430
42900
  return [];
42431
42901
  }
@@ -43001,7 +43471,10 @@ async function runEvaluatorList(options) {
43001
43471
  attempt,
43002
43472
  promptInputs,
43003
43473
  now,
43004
- judgeProvider
43474
+ judgeProvider,
43475
+ outputMessages,
43476
+ traceSummary,
43477
+ agentTimeoutMs
43005
43478
  });
43006
43479
  const weight = evaluator.weight ?? 1;
43007
43480
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -43336,9 +43809,22 @@ async function runLlmJudgeEvaluator(options) {
43336
43809
  attempt,
43337
43810
  promptInputs,
43338
43811
  now,
43339
- judgeProvider
43812
+ judgeProvider,
43813
+ outputMessages,
43814
+ traceSummary,
43815
+ agentTimeoutMs
43340
43816
  } = options;
43341
- const customPrompt = await resolveCustomPrompt(config2);
43817
+ const customPrompt = await resolveCustomPrompt(
43818
+ config2,
43819
+ {
43820
+ evalCase,
43821
+ candidate,
43822
+ outputMessages,
43823
+ traceSummary,
43824
+ config: config2.config
43825
+ },
43826
+ agentTimeoutMs
43827
+ );
43342
43828
  return evaluatorRegistry.llm_judge.evaluate({
43343
43829
  evalCase,
43344
43830
  candidate,
@@ -43352,23 +43838,70 @@ async function runLlmJudgeEvaluator(options) {
43352
43838
  evaluator: config2
43353
43839
  });
43354
43840
  }
43355
- async function resolveCustomPrompt(config2) {
43356
- if (config2.promptPath) {
43841
+ async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
43842
+ if (promptConfig.resolvedPromptScript && promptConfig.resolvedPromptScript.length > 0) {
43843
+ if (!context) {
43844
+ throw new Error("Context required for executable prompt templates");
43845
+ }
43846
+ return executePromptTemplate(
43847
+ promptConfig.resolvedPromptScript,
43848
+ context,
43849
+ promptConfig.config,
43850
+ timeoutMs
43851
+ );
43852
+ }
43853
+ const promptPath = promptConfig.resolvedPromptPath ?? promptConfig.promptPath;
43854
+ if (promptPath) {
43357
43855
  try {
43358
- const content = await readTextFile(config2.promptPath);
43856
+ const content = await readTextFile(promptPath);
43359
43857
  return content;
43360
43858
  } catch (error40) {
43361
43859
  const message = error40 instanceof Error ? error40.message : String(error40);
43362
- console.warn(`Could not read custom prompt at ${config2.promptPath}: ${message}`);
43860
+ console.warn(`Could not read custom prompt at ${promptPath}: ${message}`);
43861
+ }
43862
+ }
43863
+ const promptValue = promptConfig.prompt;
43864
+ if (typeof promptValue === "string") {
43865
+ return promptValue;
43866
+ }
43867
+ return void 0;
43868
+ }
43869
+ async function executePromptTemplate(script, context, config2, timeoutMs) {
43870
+ const payload = {
43871
+ question: context.evalCase.question,
43872
+ expectedOutcome: context.evalCase.expected_outcome,
43873
+ expectedMessages: context.evalCase.expected_messages,
43874
+ referenceAnswer: context.evalCase.reference_answer,
43875
+ candidateAnswer: context.candidate,
43876
+ outputMessages: context.outputMessages ?? null,
43877
+ guidelineFiles: context.evalCase.guideline_paths,
43878
+ inputFiles: context.evalCase.file_paths.filter(
43879
+ (p) => !context.evalCase.guideline_paths.includes(p)
43880
+ ),
43881
+ inputMessages: context.evalCase.input_messages,
43882
+ traceSummary: context.traceSummary ?? null,
43883
+ config: config2 ?? context.config ?? null
43884
+ };
43885
+ const inputJson = JSON.stringify(toSnakeCaseDeep2(payload), null, 2);
43886
+ const scriptPath = script[script.length - 1];
43887
+ const cwd = path15.dirname(scriptPath);
43888
+ try {
43889
+ const stdout = await executeScript(script, inputJson, timeoutMs, cwd);
43890
+ const prompt = stdout.trim();
43891
+ if (!prompt) {
43892
+ throw new Error("Prompt template produced empty output");
43363
43893
  }
43894
+ return prompt;
43895
+ } catch (error40) {
43896
+ const message = error40 instanceof Error ? error40.message : String(error40);
43897
+ throw new Error(`Prompt template execution failed: ${message}`);
43364
43898
  }
43365
- return config2.prompt;
43366
43899
  }
43367
- function filterEvalCases(evalCases, evalId) {
43368
- if (!evalId) {
43900
+ function filterEvalCases(evalCases, filter2) {
43901
+ if (!filter2) {
43369
43902
  return evalCases;
43370
43903
  }
43371
- return evalCases.filter((evalCase) => evalCase.id === evalId);
43904
+ return evalCases.filter((evalCase) => micromatch4.isMatch(evalCase.id, filter2));
43372
43905
  }
43373
43906
  function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
43374
43907
  const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
@@ -43522,7 +44055,7 @@ function computeWeightedMean(entries) {
43522
44055
  }
43523
44056
  var rubricItemSchema = external_exports.object({
43524
44057
  id: external_exports.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
43525
- description: external_exports.string().describe("What this rubric checks for"),
44058
+ expected_outcome: external_exports.string().describe("Concrete expected outcome for this rubric item"),
43526
44059
  weight: external_exports.number().default(1).describe("Relative importance (default 1.0)"),
43527
44060
  required: external_exports.boolean().default(true).describe("Whether this is a mandatory requirement")
43528
44061
  });
@@ -43542,7 +44075,7 @@ You must return a valid JSON object matching this schema:
43542
44075
  "rubrics": [
43543
44076
  {
43544
44077
  "id": "string (short identifier)",
43545
- "description": "string (what to check)",
44078
+ "expected_outcome": "string (concrete expected outcome for this rubric item)",
43546
44079
  "weight": number (default 1.0),
43547
44080
  "required": boolean (default true)
43548
44081
  }
@@ -43578,7 +44111,7 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
43578
44111
  "Each rubric should:",
43579
44112
  "- Be specific and testable",
43580
44113
  "- Have a short, descriptive ID",
43581
- "- Include a clear description of what to check",
44114
+ "- Include a clear expected outcome statement (what a good answer must demonstrate for this rubric)",
43582
44115
  "- Indicate if it is required (mandatory) or optional",
43583
44116
  "- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
43584
44117
  "",
@@ -44439,17 +44972,31 @@ async function validateEvalFile(filePath) {
44439
44972
  });
44440
44973
  }
44441
44974
  const inputMessages = evalCase.input_messages;
44442
- if (!Array.isArray(inputMessages)) {
44975
+ const inputAlias = evalCase.input;
44976
+ if (Array.isArray(inputMessages)) {
44977
+ validateMessages(inputMessages, `${location}.input_messages`, absolutePath, errors);
44978
+ } else if (inputAlias !== void 0) {
44979
+ if (typeof inputAlias === "string") {
44980
+ } else if (Array.isArray(inputAlias)) {
44981
+ validateMessages(inputAlias, `${location}.input`, absolutePath, errors);
44982
+ } else {
44983
+ errors.push({
44984
+ severity: "error",
44985
+ filePath: absolutePath,
44986
+ location: `${location}.input`,
44987
+ message: "Invalid 'input' field (must be a string or array of messages)"
44988
+ });
44989
+ }
44990
+ } else {
44443
44991
  errors.push({
44444
44992
  severity: "error",
44445
44993
  filePath: absolutePath,
44446
44994
  location: `${location}.input_messages`,
44447
- message: "Missing or invalid 'input_messages' field (must be an array)"
44995
+ message: "Missing 'input_messages' or 'input' field (must provide one)"
44448
44996
  });
44449
- } else {
44450
- validateMessages(inputMessages, `${location}.input_messages`, absolutePath, errors);
44451
44997
  }
44452
44998
  const expectedMessages = evalCase.expected_messages;
44999
+ const expectedOutputAlias = evalCase.expected_output;
44453
45000
  if (expectedMessages !== void 0 && !Array.isArray(expectedMessages)) {
44454
45001
  errors.push({
44455
45002
  severity: "error",
@@ -44459,6 +45006,26 @@ async function validateEvalFile(filePath) {
44459
45006
  });
44460
45007
  } else if (Array.isArray(expectedMessages)) {
44461
45008
  validateMessages(expectedMessages, `${location}.expected_messages`, absolutePath, errors);
45009
+ } else if (expectedOutputAlias !== void 0) {
45010
+ if (typeof expectedOutputAlias === "string") {
45011
+ } else if (Array.isArray(expectedOutputAlias)) {
45012
+ if (expectedOutputAlias.length > 0 && isObject2(expectedOutputAlias[0]) && "role" in expectedOutputAlias[0]) {
45013
+ validateMessages(
45014
+ expectedOutputAlias,
45015
+ `${location}.expected_output`,
45016
+ absolutePath,
45017
+ errors
45018
+ );
45019
+ }
45020
+ } else if (isObject2(expectedOutputAlias)) {
45021
+ } else {
45022
+ errors.push({
45023
+ severity: "error",
45024
+ filePath: absolutePath,
45025
+ location: `${location}.expected_output`,
45026
+ message: "Invalid 'expected_output' field (must be a string, object, or array)"
45027
+ });
45028
+ }
44462
45029
  }
44463
45030
  }
44464
45031
  return {
@@ -45302,7 +45869,7 @@ function normalizeOptions(rawOptions) {
45302
45869
  return {
45303
45870
  target: normalizeString(rawOptions.target),
45304
45871
  targetsPath: normalizeString(rawOptions.targets),
45305
- evalId: normalizeString(rawOptions.evalId),
45872
+ filter: normalizeString(rawOptions.filter),
45306
45873
  workers: workers > 0 ? workers : void 0,
45307
45874
  outPath: normalizeString(rawOptions.out),
45308
45875
  format,
@@ -45427,9 +45994,9 @@ async function prepareFileMetadata(params) {
45427
45994
  const inlineTargetLabel = `${selection.targetName} [provider=${providerLabel}]`;
45428
45995
  const evalCases = await loadEvalCases(testFilePath, repoRoot, {
45429
45996
  verbose: options.verbose,
45430
- evalId: options.evalId
45997
+ filter: options.filter
45431
45998
  });
45432
- const filteredIds = options.evalId ? evalCases.filter((value) => value.id === options.evalId).map((value) => value.id) : evalCases.map((value) => value.id);
45999
+ const filteredIds = evalCases.map((value) => value.id);
45433
46000
  return { evalIds: filteredIds, evalCases, selection, inlineTargetLabel };
45434
46001
  }
45435
46002
  async function runWithLimit(items, limit, task) {
@@ -45500,7 +46067,6 @@ async function runSingleEvalFile(params) {
45500
46067
  agentTimeoutMs,
45501
46068
  cache,
45502
46069
  useCache: options.cache,
45503
- evalId: options.evalId,
45504
46070
  evalCases,
45505
46071
  verbose: options.verbose,
45506
46072
  maxConcurrency: resolvedWorkers,
@@ -45676,7 +46242,7 @@ var evalCommand = command3({
45676
46242
  evalId: option3({
45677
46243
  type: optional4(string6),
45678
46244
  long: "eval-id",
45679
- description: "Run only the eval case with this identifier"
46245
+ description: 'Filter eval cases by ID pattern (glob supported, e.g., "summary-*")'
45680
46246
  }),
45681
46247
  workers: option3({
45682
46248
  type: number5,
@@ -45743,7 +46309,7 @@ var evalCommand = command3({
45743
46309
  const rawOptions = {
45744
46310
  target: args.target,
45745
46311
  targets: args.targets,
45746
- evalId: args.evalId,
46312
+ filter: args.evalId,
45747
46313
  workers: args.workers,
45748
46314
  out: args.out,
45749
46315
  outputFormat: args.outputFormat,
@@ -45902,14 +46468,12 @@ async function generateRubricsCommand(options) {
45902
46468
  if (caseNode && isMap(caseNode)) {
45903
46469
  caseNode.set(
45904
46470
  "rubrics",
45905
- rubrics.map(
45906
- (r) => ({
45907
- id: r.id,
45908
- description: r.description,
45909
- weight: r.weight,
45910
- required: r.required
45911
- })
45912
- )
46471
+ rubrics.filter((r) => r.expected_outcome !== void 0).map((r) => ({
46472
+ id: r.id,
46473
+ expected_outcome: r.expected_outcome,
46474
+ weight: r.weight,
46475
+ required: r.required ?? true
46476
+ }))
45913
46477
  );
45914
46478
  }
45915
46479
  updatedCount++;
@@ -46454,4 +47018,4 @@ export {
46454
47018
  app,
46455
47019
  runCli
46456
47020
  };
46457
- //# sourceMappingURL=chunk-5HTT24MQ.js.map
47021
+ //# sourceMappingURL=chunk-XREH4WAJ.js.map