@agentv/core 4.1.0 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-PXYYRDHH.js → chunk-V6QVGHVD.js} +1 -1
- package/dist/chunk-V6QVGHVD.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +17 -26
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +21 -6
- package/dist/index.d.ts +21 -6
- package/dist/index.js +17 -27
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-PXYYRDHH.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1402,6 +1402,7 @@ __export(index_exports, {
|
|
|
1402
1402
|
OtelStreamingObserver: () => OtelStreamingObserver,
|
|
1403
1403
|
OtelTraceExporter: () => OtelTraceExporter,
|
|
1404
1404
|
OtlpJsonFileExporter: () => OtlpJsonFileExporter,
|
|
1405
|
+
PASS_THRESHOLD: () => PASS_THRESHOLD,
|
|
1405
1406
|
ProviderRegistry: () => ProviderRegistry,
|
|
1406
1407
|
RepoManager: () => RepoManager,
|
|
1407
1408
|
ResponseCache: () => ResponseCache,
|
|
@@ -13309,14 +13310,9 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
13309
13310
|
}
|
|
13310
13311
|
|
|
13311
13312
|
// src/evaluation/evaluators/scoring.ts
|
|
13313
|
+
var PASS_THRESHOLD = 0.8;
|
|
13312
13314
|
function scoreToVerdict(score) {
|
|
13313
|
-
|
|
13314
|
-
return "pass";
|
|
13315
|
-
}
|
|
13316
|
-
if (score >= 0.6) {
|
|
13317
|
-
return "borderline";
|
|
13318
|
-
}
|
|
13319
|
-
return "fail";
|
|
13315
|
+
return score >= PASS_THRESHOLD ? "pass" : "fail";
|
|
13320
13316
|
}
|
|
13321
13317
|
function clampScore(value) {
|
|
13322
13318
|
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
@@ -13382,13 +13378,16 @@ function deepEqual(a, b) {
|
|
|
13382
13378
|
if (aKeys.length !== bKeys.length) return false;
|
|
13383
13379
|
return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
|
|
13384
13380
|
}
|
|
13381
|
+
var NEGATED_VERDICT = {
|
|
13382
|
+
pass: "fail",
|
|
13383
|
+
fail: "pass",
|
|
13384
|
+
skip: "skip"
|
|
13385
|
+
};
|
|
13385
13386
|
function negateScore(score) {
|
|
13386
|
-
const negatedScore = clampScore(1 - score.score);
|
|
13387
|
-
const negatedVerdict = score.verdict === "pass" ? "fail" : score.verdict === "fail" ? "pass" : "borderline";
|
|
13388
13387
|
return {
|
|
13389
13388
|
...score,
|
|
13390
|
-
score:
|
|
13391
|
-
verdict:
|
|
13389
|
+
score: clampScore(1 - score.score),
|
|
13390
|
+
verdict: NEGATED_VERDICT[score.verdict],
|
|
13392
13391
|
assertions: score.assertions.map((a) => ({
|
|
13393
13392
|
...a,
|
|
13394
13393
|
passed: !a.passed,
|
|
@@ -15105,7 +15104,7 @@ var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation resul
|
|
|
15105
15104
|
{{EVALUATOR_RESULTS_JSON}}
|
|
15106
15105
|
|
|
15107
15106
|
Decide the final score and verdict based on all evaluator results.
|
|
15108
|
-
Return a JSON object with: score (0.0-1.0), verdict (pass/fail
|
|
15107
|
+
Return a JSON object with: score (0.0-1.0), verdict (pass/fail), and reasoning.`;
|
|
15109
15108
|
var CompositeEvaluator = class {
|
|
15110
15109
|
kind = "composite";
|
|
15111
15110
|
config;
|
|
@@ -15219,7 +15218,7 @@ var CompositeEvaluator = class {
|
|
|
15219
15218
|
continue;
|
|
15220
15219
|
}
|
|
15221
15220
|
evaluatedCount++;
|
|
15222
|
-
const isPassing = member.result.verdict === "pass"
|
|
15221
|
+
const isPassing = member.result.verdict === "pass";
|
|
15223
15222
|
if (isPassing) {
|
|
15224
15223
|
passingCount++;
|
|
15225
15224
|
}
|
|
@@ -15284,7 +15283,7 @@ var CompositeEvaluator = class {
|
|
|
15284
15283
|
passed: Boolean(a.passed),
|
|
15285
15284
|
...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
|
|
15286
15285
|
})) : [];
|
|
15287
|
-
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail"
|
|
15286
|
+
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail") ? parsed.verdict : scoreToVerdict(score);
|
|
15288
15287
|
return {
|
|
15289
15288
|
score,
|
|
15290
15289
|
verdict,
|
|
@@ -18500,9 +18499,8 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
|
|
|
18500
18499
|
}
|
|
18501
18500
|
|
|
18502
18501
|
// src/evaluation/orchestrator.ts
|
|
18503
|
-
var QUALITY_PASS_THRESHOLD = 0.8;
|
|
18504
18502
|
function classifyQualityStatus(score) {
|
|
18505
|
-
return score >=
|
|
18503
|
+
return score >= PASS_THRESHOLD ? "ok" : "quality_failure";
|
|
18506
18504
|
}
|
|
18507
18505
|
function buildSkippedEvaluatorError(scores) {
|
|
18508
18506
|
const skippedScores = scores?.filter((score) => score.verdict === "skip") ?? [];
|
|
@@ -20250,7 +20248,6 @@ async function runEvaluatorList(options) {
|
|
|
20250
20248
|
}
|
|
20251
20249
|
}
|
|
20252
20250
|
}
|
|
20253
|
-
const PASS_THRESHOLD = 0.8;
|
|
20254
20251
|
const hasRequiredFailure = scored.some((entry) => {
|
|
20255
20252
|
if (!entry.required) return false;
|
|
20256
20253
|
const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
|
|
@@ -20627,24 +20624,17 @@ function mapAssertionType(type) {
|
|
|
20627
20624
|
function computeSummary(results, durationMs) {
|
|
20628
20625
|
const total = results.length;
|
|
20629
20626
|
let passed = 0;
|
|
20630
|
-
let failed = 0;
|
|
20631
|
-
let borderline = 0;
|
|
20632
20627
|
let scoreSum = 0;
|
|
20633
20628
|
for (const r of results) {
|
|
20634
20629
|
scoreSum += r.score;
|
|
20635
|
-
if (r.score >=
|
|
20630
|
+
if (r.score >= PASS_THRESHOLD) {
|
|
20636
20631
|
passed++;
|
|
20637
|
-
} else if (r.score < 0.5) {
|
|
20638
|
-
failed++;
|
|
20639
|
-
} else {
|
|
20640
|
-
borderline++;
|
|
20641
20632
|
}
|
|
20642
20633
|
}
|
|
20643
20634
|
return {
|
|
20644
20635
|
total,
|
|
20645
20636
|
passed,
|
|
20646
|
-
failed,
|
|
20647
|
-
borderline,
|
|
20637
|
+
failed: total - passed,
|
|
20648
20638
|
durationMs,
|
|
20649
20639
|
meanScore: total > 0 ? scoreSum / total : 0
|
|
20650
20640
|
};
|
|
@@ -21474,6 +21464,7 @@ function createAgentKernel() {
|
|
|
21474
21464
|
OtelStreamingObserver,
|
|
21475
21465
|
OtelTraceExporter,
|
|
21476
21466
|
OtlpJsonFileExporter,
|
|
21467
|
+
PASS_THRESHOLD,
|
|
21477
21468
|
ProviderRegistry,
|
|
21478
21469
|
RepoManager,
|
|
21479
21470
|
ResponseCache,
|