agentv 0.25.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -253,9 +253,9 @@ Code evaluators receive input via stdin and write output to stdout as JSON.
253
253
  "expected_outcome": "expected outcome description",
254
254
  "reference_answer": "gold standard answer (optional)",
255
255
  "candidate_answer": "generated code/text from the agent",
256
- "guideline_paths": ["path1", "path2"],
257
- "input_files": ["file1", "file2"],
258
- "input_segments": [{"type": "text", "value": "..."}]
256
+ "guideline_files": ["path/to/guideline1.md", "path/to/guideline2.md"],
257
+ "input_files": ["path/to/data.json", "path/to/config.yaml"],
258
+ "input_messages": [{"role": "user", "content": "..."}]
259
259
  }
260
260
  ```
261
261
 
@@ -164,7 +164,7 @@ import { access as access6, mkdir as mkdir7 } from "node:fs/promises";
164
164
  import path19 from "node:path";
165
165
  import { pathToFileURL } from "node:url";
166
166
 
167
- // ../../packages/core/dist/chunk-OYTL3LNN.js
167
+ // ../../packages/core/dist/chunk-V3JCB3HI.js
168
168
  import { constants } from "node:fs";
169
169
  import { access, readFile } from "node:fs/promises";
170
170
  import path from "node:path";
@@ -4211,7 +4211,7 @@ var coerce = {
4211
4211
  };
4212
4212
  var NEVER = INVALID;
4213
4213
 
4214
- // ../../packages/core/dist/chunk-OYTL3LNN.js
4214
+ // ../../packages/core/dist/chunk-V3JCB3HI.js
4215
4215
  async function fileExists(filePath) {
4216
4216
  try {
4217
4217
  await access(filePath, constants.F_OK);
@@ -4470,7 +4470,10 @@ function resolveAzureConfig(target, env) {
4470
4470
  const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
4471
4471
  const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
4472
4472
  const version2 = normalizeAzureApiVersion(
4473
- resolveOptionalString(versionSource, env, `${target.name} api version`)
4473
+ resolveOptionalString(versionSource, env, `${target.name} api version`, {
4474
+ allowLiteral: true,
4475
+ optionalEnv: true
4476
+ })
4474
4477
  );
4475
4478
  const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
4476
4479
  const maxOutputTokens = resolveOptionalNumber(
@@ -34575,7 +34578,7 @@ var EVALUATOR_KIND_VALUES = [
34575
34578
  "rubric",
34576
34579
  "composite",
34577
34580
  "tool_trajectory",
34578
- "expected_messages"
34581
+ "expected_tool_calls"
34579
34582
  ];
34580
34583
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
34581
34584
  function isEvaluatorKind(value) {
@@ -34928,6 +34931,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
34928
34931
  logWarning2(`Skipping code_judge evaluator '${name16}' in '${evalId}': missing script`);
34929
34932
  continue;
34930
34933
  }
34934
+ const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
34931
34935
  const cwd = asString2(rawEvaluator.cwd);
34932
34936
  let resolvedCwd;
34933
34937
  if (cwd) {
@@ -34948,7 +34952,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
34948
34952
  type: "code",
34949
34953
  script,
34950
34954
  cwd,
34951
- resolvedCwd
34955
+ resolvedCwd,
34956
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
34952
34957
  });
34953
34958
  continue;
34954
34959
  }
@@ -35043,18 +35048,22 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
35043
35048
  ...promptPath2 ? { promptPath: promptPath2 } : {}
35044
35049
  };
35045
35050
  }
35051
+ const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
35046
35052
  evaluators.push({
35047
35053
  name: name16,
35048
35054
  type: "composite",
35049
35055
  evaluators: memberEvaluators,
35050
- aggregator
35056
+ aggregator,
35057
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
35051
35058
  });
35052
35059
  continue;
35053
35060
  }
35054
- if (typeValue === "expected_messages") {
35061
+ if (typeValue === "expected_tool_calls") {
35062
+ const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
35055
35063
  evaluators.push({
35056
35064
  name: name16,
35057
- type: "expected_messages"
35065
+ type: "expected_tool_calls",
35066
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
35058
35067
  });
35059
35068
  continue;
35060
35069
  }
@@ -35110,12 +35119,14 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
35110
35119
  );
35111
35120
  continue;
35112
35121
  }
35122
+ const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
35113
35123
  const config2 = {
35114
35124
  name: name16,
35115
35125
  type: "tool_trajectory",
35116
35126
  mode,
35117
35127
  ...minimums ? { minimums } : {},
35118
- ...expected ? { expected } : {}
35128
+ ...expected ? { expected } : {},
35129
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
35119
35130
  };
35120
35131
  evaluators.push(config2);
35121
35132
  continue;
@@ -35156,19 +35167,23 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
35156
35167
  logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': no valid rubrics found`);
35157
35168
  continue;
35158
35169
  }
35170
+ const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
35159
35171
  evaluators.push({
35160
35172
  name: name16,
35161
35173
  type: "llm_judge",
35162
- rubrics: parsedRubrics
35174
+ rubrics: parsedRubrics,
35175
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
35163
35176
  });
35164
35177
  continue;
35165
35178
  }
35179
+ const weight = validateWeight(rawEvaluator.weight, name16, evalId);
35166
35180
  evaluators.push({
35167
35181
  name: name16,
35168
35182
  type: "llm_judge",
35169
35183
  prompt,
35170
35184
  promptPath,
35171
- ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
35185
+ ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
35186
+ ...weight !== void 0 ? { weight } : {}
35172
35187
  });
35173
35188
  }
35174
35189
  return evaluators.length > 0 ? evaluators : void 0;
@@ -35198,6 +35213,27 @@ ${detailBlock}${ANSI_RESET3}`);
35198
35213
  console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
35199
35214
  }
35200
35215
  }
35216
+ function validateWeight(rawWeight, evaluatorName, evalId) {
35217
+ if (rawWeight === void 0) {
35218
+ return void 0;
35219
+ }
35220
+ if (typeof rawWeight !== "number") {
35221
+ throw new Error(
35222
+ `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be a number`
35223
+ );
35224
+ }
35225
+ if (!Number.isFinite(rawWeight)) {
35226
+ throw new Error(
35227
+ `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be finite (got ${rawWeight})`
35228
+ );
35229
+ }
35230
+ if (rawWeight < 0) {
35231
+ throw new Error(
35232
+ `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be non-negative (got ${rawWeight})`
35233
+ );
35234
+ }
35235
+ return rawWeight;
35236
+ }
35201
35237
  var ANSI_YELLOW4 = "\x1B[33m";
35202
35238
  var ANSI_RESET4 = "\x1B[0m";
35203
35239
  async function processMessages(options) {
@@ -37906,9 +37942,11 @@ var CodeEvaluator = class {
37906
37942
  expected_outcome: context.evalCase.expected_outcome,
37907
37943
  reference_answer: context.evalCase.reference_answer,
37908
37944
  candidate_answer: context.candidate,
37909
- guideline_paths: context.evalCase.guideline_paths,
37910
- input_files: context.evalCase.file_paths,
37911
- input_segments: context.evalCase.input_segments
37945
+ guideline_files: context.evalCase.guideline_paths,
37946
+ input_files: context.evalCase.file_paths.filter(
37947
+ (path132) => !context.evalCase.guideline_paths.includes(path132)
37948
+ ),
37949
+ input_messages: context.evalCase.input_messages
37912
37950
  },
37913
37951
  null,
37914
37952
  2
@@ -38174,8 +38212,8 @@ var ToolTrajectoryEvaluator = class {
38174
38212
  };
38175
38213
  }
38176
38214
  };
38177
- var ExpectedMessagesEvaluator = class {
38178
- kind = "expected_messages";
38215
+ var ExpectedToolCallsEvaluator = class {
38216
+ kind = "expected_tool_calls";
38179
38217
  evaluate(context) {
38180
38218
  const { candidateTrace, evalCase } = context;
38181
38219
  const expectedSegments = evalCase.expected_segments;
@@ -39070,14 +39108,12 @@ async function evaluateCandidate(options) {
39070
39108
  } else {
39071
39109
  if (promptInputs.chatPrompt) {
39072
39110
  lmProviderRequest = {
39073
- chat_prompt: promptInputs.chatPrompt,
39074
- guideline_paths: evalCase.guideline_paths
39111
+ chat_prompt: promptInputs.chatPrompt
39075
39112
  };
39076
39113
  } else {
39077
39114
  lmProviderRequest = {
39078
39115
  question: promptInputs.question,
39079
- guidelines: promptInputs.guidelines,
39080
- guideline_paths: evalCase.guideline_paths
39116
+ guidelines: promptInputs.guidelines
39081
39117
  };
39082
39118
  }
39083
39119
  }
@@ -39184,11 +39220,13 @@ async function runEvaluatorList(options) {
39184
39220
  now,
39185
39221
  judgeProvider
39186
39222
  });
39187
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
39223
+ const weight = evaluator.weight ?? 1;
39224
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
39188
39225
  evaluatorResults.push({
39189
39226
  name: evaluator.name,
39190
39227
  type: evaluator.type,
39191
39228
  score: score2.score,
39229
+ weight,
39192
39230
  verdict: score2.verdict,
39193
39231
  hits: score2.hits,
39194
39232
  misses: score2.misses,
@@ -39211,11 +39249,13 @@ async function runEvaluatorList(options) {
39211
39249
  promptInputs,
39212
39250
  now
39213
39251
  });
39214
- scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
39252
+ const weight = evaluator.weight ?? 1;
39253
+ scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
39215
39254
  evaluatorResults.push({
39216
39255
  name: evaluator.name,
39217
39256
  type: "code_judge",
39218
39257
  score: score2.score,
39258
+ weight,
39219
39259
  verdict: score2.verdict,
39220
39260
  hits: score2.hits,
39221
39261
  misses: score2.misses,
@@ -39245,8 +39285,8 @@ async function runEvaluatorList(options) {
39245
39285
  return new ToolTrajectoryEvaluator({
39246
39286
  config: memberConfig
39247
39287
  });
39248
- case "expected_messages":
39249
- return new ExpectedMessagesEvaluator();
39288
+ case "expected_tool_calls":
39289
+ return new ExpectedToolCallsEvaluator();
39250
39290
  default: {
39251
39291
  const unknownConfig = memberConfig;
39252
39292
  throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -39268,11 +39308,13 @@ async function runEvaluatorList(options) {
39268
39308
  now,
39269
39309
  judgeProvider
39270
39310
  });
39271
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
39311
+ const weight = evaluator.weight ?? 1;
39312
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
39272
39313
  evaluatorResults.push({
39273
39314
  name: evaluator.name,
39274
39315
  type: evaluator.type,
39275
39316
  score: score2.score,
39317
+ weight,
39276
39318
  verdict: score2.verdict,
39277
39319
  hits: score2.hits,
39278
39320
  misses: score2.misses,
@@ -39296,20 +39338,22 @@ async function runEvaluatorList(options) {
39296
39338
  candidateTrace,
39297
39339
  candidateTraceSummary
39298
39340
  });
39299
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
39341
+ const weight = evaluator.weight ?? 1;
39342
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
39300
39343
  evaluatorResults.push({
39301
39344
  name: evaluator.name,
39302
39345
  type: evaluator.type,
39303
39346
  score: score2.score,
39347
+ weight,
39304
39348
  verdict: score2.verdict,
39305
39349
  hits: score2.hits,
39306
39350
  misses: score2.misses,
39307
39351
  reasoning: score2.reasoning
39308
39352
  });
39309
39353
  }
39310
- if (evaluator.type === "expected_messages") {
39311
- const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
39312
- const score2 = expectedMessagesEvaluator.evaluate({
39354
+ if (evaluator.type === "expected_tool_calls") {
39355
+ const expectedToolCallsEvaluator = new ExpectedToolCallsEvaluator();
39356
+ const score2 = expectedToolCallsEvaluator.evaluate({
39313
39357
  evalCase,
39314
39358
  candidate,
39315
39359
  target,
@@ -39320,11 +39364,13 @@ async function runEvaluatorList(options) {
39320
39364
  candidateTrace,
39321
39365
  candidateTraceSummary
39322
39366
  });
39323
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
39367
+ const weight = evaluator.weight ?? 1;
39368
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
39324
39369
  evaluatorResults.push({
39325
39370
  name: evaluator.name,
39326
39371
  type: evaluator.type,
39327
39372
  score: score2.score,
39373
+ weight,
39328
39374
  verdict: score2.verdict,
39329
39375
  hits: score2.hits,
39330
39376
  misses: score2.misses,
@@ -39342,15 +39388,18 @@ async function runEvaluatorList(options) {
39342
39388
  reasoning: message
39343
39389
  };
39344
39390
  const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
39391
+ const weight = evaluator.weight ?? 1;
39345
39392
  scored.push({
39346
39393
  score: fallbackScore,
39347
39394
  name: evaluator.name ?? "unknown",
39348
- type: resultType ?? "llm_judge"
39395
+ type: resultType ?? "llm_judge",
39396
+ weight
39349
39397
  });
39350
39398
  evaluatorResults.push({
39351
39399
  name: evaluator.name ?? "unknown",
39352
39400
  type: resultType ?? "llm_judge",
39353
39401
  score: 0,
39402
+ weight,
39354
39403
  verdict: "fail",
39355
39404
  hits: [],
39356
39405
  misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
@@ -39358,7 +39407,9 @@ async function runEvaluatorList(options) {
39358
39407
  });
39359
39408
  }
39360
39409
  }
39361
- const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
39410
+ const aggregateScore = scored.length > 0 ? computeWeightedMean(
39411
+ scored.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
39412
+ ) : 0;
39362
39413
  const hits = scored.flatMap((entry) => entry.score.hits);
39363
39414
  const misses = scored.flatMap((entry) => entry.score.misses);
39364
39415
  const expectedAspectCount = scored.reduce(
@@ -39584,6 +39635,16 @@ function mapChildResults(children) {
39584
39635
  evaluator_results: mapChildResults(child.evaluatorResults)
39585
39636
  }));
39586
39637
  }
39638
+ function computeWeightedMean(entries) {
39639
+ let totalWeight = 0;
39640
+ let weightedSum = 0;
39641
+ for (const entry of entries) {
39642
+ const weight = entry.weight ?? 1;
39643
+ totalWeight += weight;
39644
+ weightedSum += entry.score * weight;
39645
+ }
39646
+ return totalWeight > 0 ? weightedSum / totalWeight : 0;
39647
+ }
39587
39648
  var rubricItemSchema = external_exports.object({
39588
39649
  id: external_exports.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
39589
39650
  description: external_exports.string().describe("What this rubric checks for"),
@@ -42647,4 +42708,4 @@ export {
42647
42708
  app,
42648
42709
  runCli
42649
42710
  };
42650
- //# sourceMappingURL=chunk-ZVSFP6NK.js.map
42711
+ //# sourceMappingURL=chunk-RIJO5WBF.js.map