@agentv/core 4.1.1 → 4.2.0

This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -9,7 +9,7 @@ import {
9
9
  isEvaluatorKind,
10
10
  loadCasesFromFile,
11
11
  resolveFileReference
12
- } from "../../chunk-PXYYRDHH.js";
12
+ } from "../../chunk-V6QVGHVD.js";
13
13
 
14
14
  // src/evaluation/validation/file-type.ts
15
15
  import { readFile } from "node:fs/promises";
package/dist/index.cjs CHANGED
@@ -1402,6 +1402,7 @@ __export(index_exports, {
1402
1402
  OtelStreamingObserver: () => OtelStreamingObserver,
1403
1403
  OtelTraceExporter: () => OtelTraceExporter,
1404
1404
  OtlpJsonFileExporter: () => OtlpJsonFileExporter,
1405
+ PASS_THRESHOLD: () => PASS_THRESHOLD,
1405
1406
  ProviderRegistry: () => ProviderRegistry,
1406
1407
  RepoManager: () => RepoManager,
1407
1408
  ResponseCache: () => ResponseCache,
@@ -13309,14 +13310,9 @@ function resolveAndCreateProvider(definition, env = process.env) {
13309
13310
  }
13310
13311
 
13311
13312
  // src/evaluation/evaluators/scoring.ts
13313
+ var PASS_THRESHOLD = 0.8;
13312
13314
  function scoreToVerdict(score) {
13313
- if (score >= 0.8) {
13314
- return "pass";
13315
- }
13316
- if (score >= 0.6) {
13317
- return "borderline";
13318
- }
13319
- return "fail";
13315
+ return score >= PASS_THRESHOLD ? "pass" : "fail";
13320
13316
  }
13321
13317
  function clampScore(value) {
13322
13318
  if (Number.isNaN(value) || !Number.isFinite(value)) {
@@ -13382,13 +13378,16 @@ function deepEqual(a, b) {
13382
13378
  if (aKeys.length !== bKeys.length) return false;
13383
13379
  return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
13384
13380
  }
13381
+ var NEGATED_VERDICT = {
13382
+ pass: "fail",
13383
+ fail: "pass",
13384
+ skip: "skip"
13385
+ };
13385
13386
  function negateScore(score) {
13386
- const negatedScore = clampScore(1 - score.score);
13387
- const negatedVerdict = score.verdict === "pass" ? "fail" : score.verdict === "fail" ? "pass" : "borderline";
13388
13387
  return {
13389
13388
  ...score,
13390
- score: negatedScore,
13391
- verdict: negatedVerdict,
13389
+ score: clampScore(1 - score.score),
13390
+ verdict: NEGATED_VERDICT[score.verdict],
13392
13391
  assertions: score.assertions.map((a) => ({
13393
13392
  ...a,
13394
13393
  passed: !a.passed,
@@ -15105,7 +15104,7 @@ var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation resul
15105
15104
  {{EVALUATOR_RESULTS_JSON}}
15106
15105
 
15107
15106
  Decide the final score and verdict based on all evaluator results.
15108
- Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
15107
+ Return a JSON object with: score (0.0-1.0), verdict (pass/fail), and reasoning.`;
15109
15108
  var CompositeEvaluator = class {
15110
15109
  kind = "composite";
15111
15110
  config;
@@ -15219,7 +15218,7 @@ var CompositeEvaluator = class {
15219
15218
  continue;
15220
15219
  }
15221
15220
  evaluatedCount++;
15222
- const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
15221
+ const isPassing = member.result.verdict === "pass";
15223
15222
  if (isPassing) {
15224
15223
  passingCount++;
15225
15224
  }
@@ -15284,7 +15283,7 @@ var CompositeEvaluator = class {
15284
15283
  passed: Boolean(a.passed),
15285
15284
  ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
15286
15285
  })) : [];
15287
- const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
15286
+ const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail") ? parsed.verdict : scoreToVerdict(score);
15288
15287
  return {
15289
15288
  score,
15290
15289
  verdict,
@@ -18500,9 +18499,8 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
18500
18499
  }
18501
18500
 
18502
18501
  // src/evaluation/orchestrator.ts
18503
- var QUALITY_PASS_THRESHOLD = 0.8;
18504
18502
  function classifyQualityStatus(score) {
18505
- return score >= QUALITY_PASS_THRESHOLD ? "ok" : "quality_failure";
18503
+ return score >= PASS_THRESHOLD ? "ok" : "quality_failure";
18506
18504
  }
18507
18505
  function buildSkippedEvaluatorError(scores) {
18508
18506
  const skippedScores = scores?.filter((score) => score.verdict === "skip") ?? [];
@@ -20250,7 +20248,6 @@ async function runEvaluatorList(options) {
20250
20248
  }
20251
20249
  }
20252
20250
  }
20253
- const PASS_THRESHOLD = 0.8;
20254
20251
  const hasRequiredFailure = scored.some((entry) => {
20255
20252
  if (!entry.required) return false;
20256
20253
  const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
@@ -20627,24 +20624,17 @@ function mapAssertionType(type) {
20627
20624
  function computeSummary(results, durationMs) {
20628
20625
  const total = results.length;
20629
20626
  let passed = 0;
20630
- let failed = 0;
20631
- let borderline = 0;
20632
20627
  let scoreSum = 0;
20633
20628
  for (const r of results) {
20634
20629
  scoreSum += r.score;
20635
- if (r.score >= 0.8) {
20630
+ if (r.score >= PASS_THRESHOLD) {
20636
20631
  passed++;
20637
- } else if (r.score < 0.5) {
20638
- failed++;
20639
- } else {
20640
- borderline++;
20641
20632
  }
20642
20633
  }
20643
20634
  return {
20644
20635
  total,
20645
20636
  passed,
20646
- failed,
20647
- borderline,
20637
+ failed: total - passed,
20648
20638
  durationMs,
20649
20639
  meanScore: total > 0 ? scoreSum / total : 0
20650
20640
  };
@@ -21474,6 +21464,7 @@ function createAgentKernel() {
21474
21464
  OtelStreamingObserver,
21475
21465
  OtelTraceExporter,
21476
21466
  OtlpJsonFileExporter,
21467
+ PASS_THRESHOLD,
21477
21468
  ProviderRegistry,
21478
21469
  RepoManager,
21479
21470
  ResponseCache,