@tangle-network/agent-eval 0.58.1 → 0.58.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-ZWEQJIM6.js +220 -0
- package/dist/chunk-ZWEQJIM6.js.map +1 -0
- package/dist/contract/index.js +18 -3
- package/dist/contract/index.js.map +1 -1
- package/dist/index.js +5 -103
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/package.json +1 -1
- package/dist/chunk-SHTXZ4O2.js +0 -113
- package/dist/chunk-SHTXZ4O2.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
import {
|
|
2
|
+
BackendIntegrityError,
|
|
2
3
|
HoldoutAuditor,
|
|
4
|
+
assertRealBackend,
|
|
3
5
|
canaryLeakView,
|
|
4
6
|
checkBehavioralCanary,
|
|
5
7
|
checkCanaries,
|
|
6
|
-
runBehavioralCanaries
|
|
7
|
-
|
|
8
|
+
runBehavioralCanaries,
|
|
9
|
+
summarizeBackendIntegrity
|
|
10
|
+
} from "./chunk-ZWEQJIM6.js";
|
|
8
11
|
import {
|
|
9
12
|
DEFAULT_MUTATION_PRIMITIVES,
|
|
10
13
|
DEFAULT_RED_TEAM_CORPUS,
|
|
@@ -3532,107 +3535,6 @@ function canonicalize2(value) {
|
|
|
3532
3535
|
return out;
|
|
3533
3536
|
}
|
|
3534
3537
|
|
|
3535
|
-
// src/integrity/backend-integrity.ts
|
|
3536
|
-
var BackendIntegrityError = class extends AgentEvalError {
|
|
3537
|
-
constructor(message, report) {
|
|
3538
|
-
super("backend_integrity", message);
|
|
3539
|
-
this.report = report;
|
|
3540
|
-
}
|
|
3541
|
-
report;
|
|
3542
|
-
};
|
|
3543
|
-
function isStubRecord(rec) {
|
|
3544
|
-
return rec.tokenUsage.input === 0 && rec.tokenUsage.output === 0;
|
|
3545
|
-
}
|
|
3546
|
-
function isUncostedRecord(rec) {
|
|
3547
|
-
return rec.tokenUsage.output > 0 && rec.costUsd === 0;
|
|
3548
|
-
}
|
|
3549
|
-
function summarizeBackendIntegrity(records) {
|
|
3550
|
-
const totalRecords = records.length;
|
|
3551
|
-
let stubRecords = 0;
|
|
3552
|
-
let realRecords = 0;
|
|
3553
|
-
let uncostedRecords = 0;
|
|
3554
|
-
let totalInputTokens = 0;
|
|
3555
|
-
let totalOutputTokens = 0;
|
|
3556
|
-
let totalCostUsd = 0;
|
|
3557
|
-
for (const rec of records) {
|
|
3558
|
-
totalInputTokens += rec.tokenUsage.input;
|
|
3559
|
-
totalOutputTokens += rec.tokenUsage.output;
|
|
3560
|
-
totalCostUsd += rec.costUsd;
|
|
3561
|
-
if (isStubRecord(rec)) stubRecords++;
|
|
3562
|
-
else realRecords++;
|
|
3563
|
-
if (isUncostedRecord(rec)) uncostedRecords++;
|
|
3564
|
-
}
|
|
3565
|
-
const verdict = totalRecords === 0 ? "stub" : stubRecords === totalRecords ? "stub" : stubRecords === 0 ? "real" : "mixed";
|
|
3566
|
-
const diagnosis = buildDiagnosis({
|
|
3567
|
-
totalRecords,
|
|
3568
|
-
stubRecords,
|
|
3569
|
-
realRecords,
|
|
3570
|
-
uncostedRecords,
|
|
3571
|
-
totalInputTokens,
|
|
3572
|
-
totalOutputTokens,
|
|
3573
|
-
totalCostUsd,
|
|
3574
|
-
verdict
|
|
3575
|
-
});
|
|
3576
|
-
return {
|
|
3577
|
-
totalRecords,
|
|
3578
|
-
stubRecords,
|
|
3579
|
-
realRecords,
|
|
3580
|
-
uncostedRecords,
|
|
3581
|
-
totalInputTokens,
|
|
3582
|
-
totalOutputTokens,
|
|
3583
|
-
totalCostUsd,
|
|
3584
|
-
verdict,
|
|
3585
|
-
diagnosis
|
|
3586
|
-
};
|
|
3587
|
-
}
|
|
3588
|
-
function buildDiagnosis(r) {
|
|
3589
|
-
if (r.totalRecords === 0) {
|
|
3590
|
-
return "no records \u2014 eval produced zero runs; backend likely failed before first turn";
|
|
3591
|
-
}
|
|
3592
|
-
if (r.verdict === "stub") {
|
|
3593
|
-
return [
|
|
3594
|
-
`all ${r.totalRecords} records have zero token usage \u2014 the LLM backend was never called.`,
|
|
3595
|
-
"common causes: --backend sandbox without a sandbox bridge running; stub model returning hard-coded strings;",
|
|
3596
|
-
"auth misconfigured so requests were silently dropped before the LLM. Re-run with --backend tcloud and TANGLE_API_KEY set,",
|
|
3597
|
-
"or boot the cli-bridge / sandbox before invoking the eval."
|
|
3598
|
-
].join(" ");
|
|
3599
|
-
}
|
|
3600
|
-
if (r.verdict === "mixed") {
|
|
3601
|
-
const pct = (r.stubRecords / r.totalRecords * 100).toFixed(0);
|
|
3602
|
-
return [
|
|
3603
|
-
`${r.stubRecords}/${r.totalRecords} records (${pct}%) have zero token usage \u2014 the backend partially failed.`,
|
|
3604
|
-
"common causes: rate-limit cascade (429s after the first N personas);",
|
|
3605
|
-
"transient auth expiry mid-run; provider outage. Treat the affected records as missing data, not agent failures."
|
|
3606
|
-
].join(" ");
|
|
3607
|
-
}
|
|
3608
|
-
if (r.uncostedRecords > 0) {
|
|
3609
|
-
const pct = (r.uncostedRecords / r.totalRecords * 100).toFixed(0);
|
|
3610
|
-
return [
|
|
3611
|
-
`${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens).`,
|
|
3612
|
-
`${r.uncostedRecords} (${pct}%) have output tokens but costUsd=0 \u2014 cost ledger is mis-wired (no input-token`,
|
|
3613
|
-
"propagation from the runtime stream into RunRecord)."
|
|
3614
|
-
].join(" ");
|
|
3615
|
-
}
|
|
3616
|
-
return `${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens, $${r.totalCostUsd.toFixed(4)}).`;
|
|
3617
|
-
}
|
|
3618
|
-
function assertRealBackend(records, opts = {}) {
|
|
3619
|
-
const report = summarizeBackendIntegrity(records);
|
|
3620
|
-
const allowMixed = opts.allowMixed ?? true;
|
|
3621
|
-
if (report.verdict === "stub") {
|
|
3622
|
-
throw new BackendIntegrityError(
|
|
3623
|
-
`backend-integrity: ran against a stub or unconfigured backend \u2014 ${report.diagnosis}`,
|
|
3624
|
-
report
|
|
3625
|
-
);
|
|
3626
|
-
}
|
|
3627
|
-
if (!allowMixed && report.verdict === "mixed") {
|
|
3628
|
-
throw new BackendIntegrityError(
|
|
3629
|
-
`backend-integrity: partial backend failure rejected \u2014 ${report.diagnosis}`,
|
|
3630
|
-
report
|
|
3631
|
-
);
|
|
3632
|
-
}
|
|
3633
|
-
return report;
|
|
3634
|
-
}
|
|
3635
|
-
|
|
3636
3538
|
// src/integrity/single-backend.ts
|
|
3637
3539
|
var SingleBackendError = class extends AgentEvalError {
|
|
3638
3540
|
constructor(message, report) {
|