@tangle-network/agent-eval 0.59.1 → 0.61.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +21 -0
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/http.js +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/langchain.js +1 -1
- package/dist/adapters/otel.d.ts +5 -5
- package/dist/adapters/otel.js +1 -1
- package/dist/agent-profile-9J9hxdm2.d.ts +114 -0
- package/dist/benchmarks/index.d.ts +3 -3
- package/dist/benchmarks/index.js +2 -2
- package/dist/builder-eval/index.js +3 -3
- package/dist/campaign/index.d.ts +153 -9
- package/dist/campaign/index.js +229 -23
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-QDOSODID.js → chunk-3B7Y5AUR.js} +2 -2
- package/dist/{chunk-QYJT52YW.js → chunk-3BFEG2F6.js} +1 -1
- package/dist/chunk-3BFEG2F6.js.map +1 -0
- package/dist/{chunk-J4DIMSRK.js → chunk-6EKXFFGQ.js} +2 -2
- package/dist/{chunk-MHQPVHXU.js → chunk-6QDKWHLS.js} +2 -2
- package/dist/{chunk-63EPZQUZ.js → chunk-6REHLN5J.js} +2 -2
- package/dist/{chunk-GM476SZU.js → chunk-AIWHLG7J.js} +5 -5
- package/dist/{chunk-AIXHUIHG.js → chunk-B26KI423.js} +3 -3
- package/dist/{chunk-NCK5QLGT.js → chunk-F3SRAAZO.js} +2 -2
- package/dist/{chunk-N4SBKEPJ.js → chunk-GMXHLSLL.js} +107 -2
- package/dist/chunk-GMXHLSLL.js.map +1 -0
- package/dist/{chunk-VXNVVBZO.js → chunk-IHDHUN2X.js} +2 -2
- package/dist/{chunk-S3SDD56V.js → chunk-ITBRCT73.js} +2 -2
- package/dist/{chunk-OLIBRKRD.js → chunk-KX6F6NCG.js} +2 -2
- package/dist/{chunk-74Y2EMNH.js → chunk-OLULBECP.js} +18 -6
- package/dist/chunk-OLULBECP.js.map +1 -0
- package/dist/chunk-PQV2TKC3.js +27 -0
- package/dist/chunk-PQV2TKC3.js.map +1 -0
- package/dist/chunk-PZ5AY32C.js +10 -0
- package/dist/{chunk-UBPIXOC4.js → chunk-SBCB6VZY.js} +2 -2
- package/dist/chunk-SHTXZ4O2.js +113 -0
- package/dist/chunk-SHTXZ4O2.js.map +1 -0
- package/dist/{chunk-JB4UWIM6.js → chunk-SUGME4OT.js} +266 -15
- package/dist/chunk-SUGME4OT.js.map +1 -0
- package/dist/{chunk-YTMXBHFM.js → chunk-T375SUOZ.js} +2 -2
- package/dist/{chunk-PIEAE33T.js → chunk-Z4ZCBC7M.js} +2 -2
- package/dist/cli.js +4 -4
- package/dist/contract/index.d.ts +48 -16
- package/dist/contract/index.js +59 -19
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-DjEgwWNo.d.ts → control-Bf8owbuG.d.ts} +2 -2
- package/dist/control.d.ts +5 -5
- package/dist/control.js +4 -4
- package/dist/{dataset-BlwAtYYf.d.ts → dataset-B2kL-fSM.d.ts} +1 -1
- package/dist/{errors-mje_cKOs.d.ts → errors-Dwqw-T_m.d.ts} +1 -1
- package/dist/{feedback-trajectory-DpUmE90J.d.ts → feedback-trajectory-8hKC5EOb.d.ts} +1 -1
- package/dist/governance/index.d.ts +3 -3
- package/dist/governance/index.js +1 -1
- package/dist/hosted/index.d.ts +5 -5
- package/dist/hosted/index.js +1 -1
- package/dist/{index-wlaiph9Y.d.ts → index-Bvk35ils.d.ts} +1 -1
- package/dist/{index-D2nT6_KT.d.ts → index-D9dwa00f.d.ts} +2 -2
- package/dist/index.d.ts +24 -132
- package/dist/index.js +23 -36
- package/dist/index.js.map +1 -1
- package/dist/{integrity-CfXjSqEv.d.ts → integrity-CJzrpUua.d.ts} +1 -1
- package/dist/knowledge/index.js +1 -1
- package/dist/{llm-client-BXVRUZyX.d.ts → llm-client-DbjLfz-K.d.ts} +1 -1
- package/dist/matrix/index.js +1 -1
- package/dist/meta-eval/index.d.ts +3 -3
- package/dist/meta-eval/index.js +1 -1
- package/dist/multishot/index.js +1 -1
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +4 -4
- package/dist/prm/index.js +1 -1
- package/dist/{run-improvement-loop-BhfdjrMY.d.ts → provenance-D0WeCXt1.d.ts} +208 -6
- package/dist/{red-team-CrC5MZYd.d.ts → red-team-DW9Ca_tj.d.ts} +1 -1
- package/dist/{registry-DK9kqXvb.d.ts → registry-qmbYT3Eo.d.ts} +2 -2
- package/dist/{release-report-DmPjIce3.d.ts → release-report-DszkgvJ3.d.ts} +3 -3
- package/dist/reporting.d.ts +6 -6
- package/dist/reporting.js +5 -5
- package/dist/{researcher-JP8EvnLv.d.ts → researcher-BaVsy0sW.d.ts} +4 -4
- package/dist/rl.d.ts +9 -9
- package/dist/rl.js +8 -8
- package/dist/{rubric-predictive-validity-B3qNa4aY.d.ts → rubric-predictive-validity-DgBHWsh7.d.ts} +1 -1
- package/dist/run-campaign-HXPJAUZ3.js +10 -0
- package/dist/{run-record-etiCMsUq.d.ts → run-record-DgUVo5pw.d.ts} +1 -1
- package/dist/{summary-report-DLxh4yWk.d.ts → summary-report-BQvXpvaR.d.ts} +1 -1
- package/dist/telemetry/file.js +1 -1
- package/dist/telemetry/index.js +1 -1
- package/dist/traces.d.ts +2 -2
- package/dist/traces.js +4 -4
- package/dist/{types-BgrxOJSf.d.ts → types-Beb6KPqZ.d.ts} +52 -4
- package/dist/wire/index.d.ts +3 -3
- package/dist/wire/index.js +4 -4
- package/package.json +1 -1
- package/dist/chunk-74Y2EMNH.js.map +0 -1
- package/dist/chunk-JB4UWIM6.js.map +0 -1
- package/dist/chunk-N4SBKEPJ.js.map +0 -1
- package/dist/chunk-NSBPE2FW.js +0 -17
- package/dist/chunk-QYJT52YW.js.map +0 -1
- package/dist/chunk-ZWEQJIM6.js +0 -220
- package/dist/chunk-ZWEQJIM6.js.map +0 -1
- package/dist/run-campaign-ZURVWMMI.js +0 -10
- /package/dist/{chunk-QDOSODID.js.map → chunk-3B7Y5AUR.js.map} +0 -0
- /package/dist/{chunk-J4DIMSRK.js.map → chunk-6EKXFFGQ.js.map} +0 -0
- /package/dist/{chunk-MHQPVHXU.js.map → chunk-6QDKWHLS.js.map} +0 -0
- /package/dist/{chunk-63EPZQUZ.js.map → chunk-6REHLN5J.js.map} +0 -0
- /package/dist/{chunk-GM476SZU.js.map → chunk-AIWHLG7J.js.map} +0 -0
- /package/dist/{chunk-AIXHUIHG.js.map → chunk-B26KI423.js.map} +0 -0
- /package/dist/{chunk-NCK5QLGT.js.map → chunk-F3SRAAZO.js.map} +0 -0
- /package/dist/{chunk-VXNVVBZO.js.map → chunk-IHDHUN2X.js.map} +0 -0
- /package/dist/{chunk-S3SDD56V.js.map → chunk-ITBRCT73.js.map} +0 -0
- /package/dist/{chunk-OLIBRKRD.js.map → chunk-KX6F6NCG.js.map} +0 -0
- /package/dist/{chunk-NSBPE2FW.js.map → chunk-PZ5AY32C.js.map} +0 -0
- /package/dist/{chunk-UBPIXOC4.js.map → chunk-SBCB6VZY.js.map} +0 -0
- /package/dist/{chunk-YTMXBHFM.js.map → chunk-T375SUOZ.js.map} +0 -0
- /package/dist/{chunk-PIEAE33T.js.map → chunk-Z4ZCBC7M.js.map} +0 -0
- /package/dist/{run-campaign-ZURVWMMI.js.map → run-campaign-HXPJAUZ3.js.map} +0 -0
package/dist/chunk-ZWEQJIM6.js
DELETED
|
@@ -1,220 +0,0 @@
|
|
|
1
|
-
import {
|
|
2
|
-
llmSpans
|
|
3
|
-
} from "./chunk-47X6LRCE.js";
|
|
4
|
-
import {
|
|
5
|
-
AgentEvalError
|
|
6
|
-
} from "./chunk-QYJT52YW.js";
|
|
7
|
-
|
|
8
|
-
// src/integrity/backend-integrity.ts
|
|
9
|
-
var BackendIntegrityError = class extends AgentEvalError {
|
|
10
|
-
constructor(message, report) {
|
|
11
|
-
super("backend_integrity", message);
|
|
12
|
-
this.report = report;
|
|
13
|
-
}
|
|
14
|
-
report;
|
|
15
|
-
};
|
|
16
|
-
function isStubRecord(rec) {
|
|
17
|
-
return rec.tokenUsage.input === 0 && rec.tokenUsage.output === 0;
|
|
18
|
-
}
|
|
19
|
-
function isUncostedRecord(rec) {
|
|
20
|
-
return rec.tokenUsage.output > 0 && rec.costUsd === 0;
|
|
21
|
-
}
|
|
22
|
-
function summarizeBackendIntegrity(records) {
|
|
23
|
-
const totalRecords = records.length;
|
|
24
|
-
let stubRecords = 0;
|
|
25
|
-
let realRecords = 0;
|
|
26
|
-
let uncostedRecords = 0;
|
|
27
|
-
let totalInputTokens = 0;
|
|
28
|
-
let totalOutputTokens = 0;
|
|
29
|
-
let totalCostUsd = 0;
|
|
30
|
-
for (const rec of records) {
|
|
31
|
-
totalInputTokens += rec.tokenUsage.input;
|
|
32
|
-
totalOutputTokens += rec.tokenUsage.output;
|
|
33
|
-
totalCostUsd += rec.costUsd;
|
|
34
|
-
if (isStubRecord(rec)) stubRecords++;
|
|
35
|
-
else realRecords++;
|
|
36
|
-
if (isUncostedRecord(rec)) uncostedRecords++;
|
|
37
|
-
}
|
|
38
|
-
const verdict = totalRecords === 0 ? "stub" : stubRecords === totalRecords ? "stub" : stubRecords === 0 ? "real" : "mixed";
|
|
39
|
-
const diagnosis = buildDiagnosis({
|
|
40
|
-
totalRecords,
|
|
41
|
-
stubRecords,
|
|
42
|
-
realRecords,
|
|
43
|
-
uncostedRecords,
|
|
44
|
-
totalInputTokens,
|
|
45
|
-
totalOutputTokens,
|
|
46
|
-
totalCostUsd,
|
|
47
|
-
verdict
|
|
48
|
-
});
|
|
49
|
-
return {
|
|
50
|
-
totalRecords,
|
|
51
|
-
stubRecords,
|
|
52
|
-
realRecords,
|
|
53
|
-
uncostedRecords,
|
|
54
|
-
totalInputTokens,
|
|
55
|
-
totalOutputTokens,
|
|
56
|
-
totalCostUsd,
|
|
57
|
-
verdict,
|
|
58
|
-
diagnosis
|
|
59
|
-
};
|
|
60
|
-
}
|
|
61
|
-
function buildDiagnosis(r) {
|
|
62
|
-
if (r.totalRecords === 0) {
|
|
63
|
-
return "no records \u2014 eval produced zero runs; backend likely failed before first turn";
|
|
64
|
-
}
|
|
65
|
-
if (r.verdict === "stub") {
|
|
66
|
-
return [
|
|
67
|
-
`all ${r.totalRecords} records have zero token usage \u2014 the LLM backend was never called.`,
|
|
68
|
-
"common causes: --backend sandbox without a sandbox bridge running; stub model returning hard-coded strings;",
|
|
69
|
-
"auth misconfigured so requests were silently dropped before the LLM. Re-run with --backend tcloud and TANGLE_API_KEY set,",
|
|
70
|
-
"or boot the cli-bridge / sandbox before invoking the eval."
|
|
71
|
-
].join(" ");
|
|
72
|
-
}
|
|
73
|
-
if (r.verdict === "mixed") {
|
|
74
|
-
const pct = (r.stubRecords / r.totalRecords * 100).toFixed(0);
|
|
75
|
-
return [
|
|
76
|
-
`${r.stubRecords}/${r.totalRecords} records (${pct}%) have zero token usage \u2014 the backend partially failed.`,
|
|
77
|
-
"common causes: rate-limit cascade (429s after the first N personas);",
|
|
78
|
-
"transient auth expiry mid-run; provider outage. Treat the affected records as missing data, not agent failures."
|
|
79
|
-
].join(" ");
|
|
80
|
-
}
|
|
81
|
-
if (r.uncostedRecords > 0) {
|
|
82
|
-
const pct = (r.uncostedRecords / r.totalRecords * 100).toFixed(0);
|
|
83
|
-
return [
|
|
84
|
-
`${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens).`,
|
|
85
|
-
`${r.uncostedRecords} (${pct}%) have output tokens but costUsd=0 \u2014 cost ledger is mis-wired (no input-token`,
|
|
86
|
-
"propagation from the runtime stream into RunRecord)."
|
|
87
|
-
].join(" ");
|
|
88
|
-
}
|
|
89
|
-
return `${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens, $${r.totalCostUsd.toFixed(4)}).`;
|
|
90
|
-
}
|
|
91
|
-
function assertRealBackend(records, opts = {}) {
|
|
92
|
-
const report = summarizeBackendIntegrity(records);
|
|
93
|
-
const allowMixed = opts.allowMixed ?? true;
|
|
94
|
-
if (report.verdict === "stub") {
|
|
95
|
-
throw new BackendIntegrityError(
|
|
96
|
-
`backend-integrity: ran against a stub or unconfigured backend \u2014 ${report.diagnosis}`,
|
|
97
|
-
report
|
|
98
|
-
);
|
|
99
|
-
}
|
|
100
|
-
if (!allowMixed && report.verdict === "mixed") {
|
|
101
|
-
throw new BackendIntegrityError(
|
|
102
|
-
`backend-integrity: partial backend failure rejected \u2014 ${report.diagnosis}`,
|
|
103
|
-
report
|
|
104
|
-
);
|
|
105
|
-
}
|
|
106
|
-
return report;
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
// src/contamination-guard.ts
|
|
110
|
-
function checkCanaries(output, scenarios) {
|
|
111
|
-
const leaks = [];
|
|
112
|
-
for (const s of scenarios) {
|
|
113
|
-
if (!s.canary) continue;
|
|
114
|
-
if (output.includes(s.canary)) {
|
|
115
|
-
leaks.push({ scenarioId: s.id, canary: s.canary, evidence: excerpt(output, s.canary) });
|
|
116
|
-
}
|
|
117
|
-
}
|
|
118
|
-
return leaks;
|
|
119
|
-
}
|
|
120
|
-
function checkBehavioralCanary(output, scenario) {
|
|
121
|
-
const pattern = scenario.forbiddenPattern ?? scenario.canary;
|
|
122
|
-
if (!pattern) return null;
|
|
123
|
-
const hit = matchForbidden(output, pattern);
|
|
124
|
-
if (!hit) return null;
|
|
125
|
-
return {
|
|
126
|
-
scenarioId: scenario.id,
|
|
127
|
-
canary: pattern,
|
|
128
|
-
evidence: excerpt(output, hit)
|
|
129
|
-
};
|
|
130
|
-
}
|
|
131
|
-
function runBehavioralCanaries(cases) {
|
|
132
|
-
const leaks = [];
|
|
133
|
-
for (const c of cases) {
|
|
134
|
-
const leak = checkBehavioralCanary(c.output, c.scenario);
|
|
135
|
-
if (leak) leaks.push({ ...leak, runId: c.runId ?? leak.runId });
|
|
136
|
-
}
|
|
137
|
-
return leaks;
|
|
138
|
-
}
|
|
139
|
-
function matchForbidden(output, pattern) {
|
|
140
|
-
const re = tryParseRegex(pattern);
|
|
141
|
-
if (re) {
|
|
142
|
-
const m = output.match(re);
|
|
143
|
-
return m && m[0].length > 0 ? m[0] : null;
|
|
144
|
-
}
|
|
145
|
-
return output.includes(pattern) ? pattern : null;
|
|
146
|
-
}
|
|
147
|
-
function tryParseRegex(pattern) {
|
|
148
|
-
if (pattern.length < 2 || pattern[0] !== "/") return null;
|
|
149
|
-
const last = pattern.lastIndexOf("/");
|
|
150
|
-
if (last <= 0) return null;
|
|
151
|
-
const body = pattern.slice(1, last);
|
|
152
|
-
const flags = pattern.slice(last + 1);
|
|
153
|
-
if (!/^[gimsuy]*$/.test(flags)) return null;
|
|
154
|
-
try {
|
|
155
|
-
return new RegExp(body, flags);
|
|
156
|
-
} catch {
|
|
157
|
-
return null;
|
|
158
|
-
}
|
|
159
|
-
}
|
|
160
|
-
async function canaryLeakView(store, scenarios) {
|
|
161
|
-
const targets = scenarios.filter((s) => !!s.canary);
|
|
162
|
-
if (targets.length === 0) return [];
|
|
163
|
-
const spans = await llmSpans(store);
|
|
164
|
-
const leaks = [];
|
|
165
|
-
for (const span of spans) {
|
|
166
|
-
const output = span.output ?? "";
|
|
167
|
-
for (const s of targets) {
|
|
168
|
-
if (s.canary && output.includes(s.canary)) {
|
|
169
|
-
leaks.push({
|
|
170
|
-
scenarioId: s.id,
|
|
171
|
-
canary: s.canary,
|
|
172
|
-
runId: span.runId,
|
|
173
|
-
evidence: excerpt(output, s.canary)
|
|
174
|
-
});
|
|
175
|
-
}
|
|
176
|
-
}
|
|
177
|
-
}
|
|
178
|
-
return leaks;
|
|
179
|
-
}
|
|
180
|
-
var HoldoutAuditor = class {
|
|
181
|
-
scenarios;
|
|
182
|
-
accessLog = [];
|
|
183
|
-
constructor(scenarios) {
|
|
184
|
-
this.scenarios = scenarios;
|
|
185
|
-
}
|
|
186
|
-
/** Retrieve a holdout scenario for a declared purpose. Non-'evaluation' throws. */
|
|
187
|
-
get(scenarioId, purpose) {
|
|
188
|
-
if (purpose !== "evaluation" && purpose !== "debugging") {
|
|
189
|
-
throw new Error(
|
|
190
|
-
`HoldoutAuditor.get: purpose must be 'evaluation' or 'debugging', got ${purpose}`
|
|
191
|
-
);
|
|
192
|
-
}
|
|
193
|
-
const s = this.scenarios.find((x) => x.id === scenarioId);
|
|
194
|
-
if (!s) throw new Error(`holdout scenario "${scenarioId}" not found`);
|
|
195
|
-
this.accessLog.push({ scenarioId, purpose, at: Date.now() });
|
|
196
|
-
return s;
|
|
197
|
-
}
|
|
198
|
-
getAccessLog() {
|
|
199
|
-
return this.accessLog;
|
|
200
|
-
}
|
|
201
|
-
};
|
|
202
|
-
function excerpt(source, needle) {
|
|
203
|
-
const at = source.indexOf(needle);
|
|
204
|
-
if (at < 0) return "";
|
|
205
|
-
const start = Math.max(0, at - 30);
|
|
206
|
-
const end = Math.min(source.length, at + needle.length + 30);
|
|
207
|
-
return (start > 0 ? "\u2026" : "") + source.slice(start, end) + (end < source.length ? "\u2026" : "");
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
export {
|
|
211
|
-
BackendIntegrityError,
|
|
212
|
-
summarizeBackendIntegrity,
|
|
213
|
-
assertRealBackend,
|
|
214
|
-
checkCanaries,
|
|
215
|
-
checkBehavioralCanary,
|
|
216
|
-
runBehavioralCanaries,
|
|
217
|
-
canaryLeakView,
|
|
218
|
-
HoldoutAuditor
|
|
219
|
-
};
|
|
220
|
-
//# sourceMappingURL=chunk-ZWEQJIM6.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/integrity/backend-integrity.ts","../src/contamination-guard.ts"],"sourcesContent":["/**\n * Backend-integrity guard: distinguish \"agent failed\" from \"eval ran against\n * a stub / unconfigured backend.\" Without this guard a canonical eval can\n * silently report `0/N passed` and look like an agent-quality problem when\n * the LLM was never actually called — the failure mode we just hit running\n * the 4-vertical parallel eval (legal-sandbox-stub returned hard-coded 33-104\n * char strings; gtm/creative defaulted to a cli-bridge that wasn't running).\n *\n * The shape:\n *\n * const report = summarizeBackendIntegrity(records)\n * assertRealBackend(records) // throws BackendIntegrityError if 100% stub\n *\n * A record is \"stub-mode\" if its `tokenUsage.input === 0 && tokenUsage.output === 0`.\n * (`costUsd` alone is unreliable — some backends successfully call LLMs but\n * don't propagate pricing, producing real tokens with $0 cost.)\n *\n * Verdicts:\n * - `real` — at least one record has nonzero token usage\n * - `stub` — every record is stub-mode (eval ran blind)\n * - `mixed` — some records real, some stub (partial backend failure;\n * often the 429-cascade or auth-half-failed case)\n */\n\nimport { AgentEvalError } from '../errors'\nimport type { RunRecord } from '../run-record'\n\nexport interface BackendIntegrityReport {\n /** Total records inspected. */\n totalRecords: number\n /** Records with input=0 AND output=0 (a stub fingerprint). */\n stubRecords: number\n /** Records with nonzero token usage (real LLM activity). */\n realRecords: number\n /** Records where output>0 but costUsd=0 (real LLM, broken cost ledger). */\n uncostedRecords: number\n /** Sum of input tokens across all records. */\n totalInputTokens: number\n /** Sum of output tokens across all records. */\n totalOutputTokens: number\n /** Sum of costUsd across all records. */\n totalCostUsd: number\n /** Worst-case integrity verdict. */\n verdict: 'real' | 'mixed' | 'stub'\n /** Human-readable diagnosis suitable for terminal output. */\n diagnosis: string\n}\n\n/**\n * Error thrown when an integrity assertion fails. Caller can pattern-match\n * by `code === 'AGENT_EVAL_BACKEND_STUB'` to differentiate from other\n * errors.\n */\nexport class BackendIntegrityError extends AgentEvalError {\n constructor(\n message: string,\n public readonly report: BackendIntegrityReport,\n ) {\n super('backend_integrity', message)\n }\n}\n\nfunction isStubRecord(rec: RunRecord): boolean {\n return rec.tokenUsage.input === 0 && rec.tokenUsage.output === 0\n}\n\nfunction isUncostedRecord(rec: RunRecord): boolean {\n return rec.tokenUsage.output > 0 && rec.costUsd === 0\n}\n\n/**\n * Inspect a batch of RunRecords and return an integrity report. Pure\n * function — no I/O, no logging. The caller decides what to do with the\n * verdict (print warning, throw, gate CI, etc.).\n */\nexport function summarizeBackendIntegrity(\n records: ReadonlyArray<RunRecord>,\n): BackendIntegrityReport {\n const totalRecords = records.length\n let stubRecords = 0\n let realRecords = 0\n let uncostedRecords = 0\n let totalInputTokens = 0\n let totalOutputTokens = 0\n let totalCostUsd = 0\n for (const rec of records) {\n totalInputTokens += rec.tokenUsage.input\n totalOutputTokens += rec.tokenUsage.output\n totalCostUsd += rec.costUsd\n if (isStubRecord(rec)) stubRecords++\n else realRecords++\n if (isUncostedRecord(rec)) uncostedRecords++\n }\n const verdict: BackendIntegrityReport['verdict'] =\n totalRecords === 0\n ? 'stub'\n : stubRecords === totalRecords\n ? 'stub'\n : stubRecords === 0\n ? 'real'\n : 'mixed'\n const diagnosis = buildDiagnosis({\n totalRecords,\n stubRecords,\n realRecords,\n uncostedRecords,\n totalInputTokens,\n totalOutputTokens,\n totalCostUsd,\n verdict,\n })\n return {\n totalRecords,\n stubRecords,\n realRecords,\n uncostedRecords,\n totalInputTokens,\n totalOutputTokens,\n totalCostUsd,\n verdict,\n diagnosis,\n }\n}\n\nfunction buildDiagnosis(r: Omit<BackendIntegrityReport, 'diagnosis'>): string {\n if (r.totalRecords === 0) {\n return 'no records — eval produced zero runs; backend likely failed before first turn'\n }\n if (r.verdict === 'stub') {\n return [\n `all ${r.totalRecords} records have zero token usage — the LLM backend was never called.`,\n 'common causes: --backend sandbox without a sandbox bridge running; stub model returning hard-coded strings;',\n 'auth misconfigured so requests were silently dropped before the LLM. Re-run with --backend tcloud and TANGLE_API_KEY set,',\n 'or boot the cli-bridge / sandbox before invoking the eval.',\n ].join(' ')\n }\n if (r.verdict === 'mixed') {\n const pct = ((r.stubRecords / r.totalRecords) * 100).toFixed(0)\n return [\n `${r.stubRecords}/${r.totalRecords} records (${pct}%) have zero token usage — the backend partially failed.`,\n 'common causes: rate-limit cascade (429s after the first N personas);',\n 'transient auth expiry mid-run; provider outage. Treat the affected records as missing data, not agent failures.',\n ].join(' ')\n }\n // verdict === 'real'\n if (r.uncostedRecords > 0) {\n const pct = ((r.uncostedRecords / r.totalRecords) * 100).toFixed(0)\n return [\n `${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens).`,\n `${r.uncostedRecords} (${pct}%) have output tokens but costUsd=0 — cost ledger is mis-wired (no input-token`,\n 'propagation from the runtime stream into RunRecord).',\n ].join(' ')\n }\n return `${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens, $${r.totalCostUsd.toFixed(4)}).`\n}\n\n/**\n * Throw BackendIntegrityError if the verdict is 'stub' — i.e. every record\n * shows zero LLM activity. Non-strict callers can pass `{ allowMixed: false }`\n * to also reject mixed verdicts (recommended for CI gates).\n *\n * Real backends pass through silently.\n */\nexport function assertRealBackend(\n records: ReadonlyArray<RunRecord>,\n opts: { allowMixed?: boolean } = {},\n): BackendIntegrityReport {\n const report = summarizeBackendIntegrity(records)\n const allowMixed = opts.allowMixed ?? true\n if (report.verdict === 'stub') {\n throw new BackendIntegrityError(\n `backend-integrity: ran against a stub or unconfigured backend — ${report.diagnosis}`,\n report,\n )\n }\n if (!allowMixed && report.verdict === 'mixed') {\n throw new BackendIntegrityError(\n `backend-integrity: partial backend failure rejected — ${report.diagnosis}`,\n report,\n )\n }\n return report\n}\n","/**\n * ContaminationGuard — ensures held-out scenarios don't leak into\n * training/prompt paths, and flags model memorization.\n *\n * Three probes:\n * 1. `checkCanaries(output, scenario)` — if the scenario carries a\n * canary token, it MUST NOT appear in the agent's output.\n * Canaries are strings that are statistically impossible to\n * reconstruct from the scenario description alone — so if they\n * echo back, the model memorized them.\n * 2. `canaryLeakView(store)` — cross-corpus view of every run whose\n * output contained a canary, with the offending scenario + run.\n * 3. `HoldoutAuditor` — wraps a Dataset and emits a structured error\n * on any code path that reads holdout scenarios but doesn't flag\n * `purpose: 'evaluation'`. Keeps engineers honest.\n */\n\nimport type { DatasetScenario } from './dataset'\nimport { llmSpans } from './trace/query'\nimport type { TraceStore } from './trace/store'\n\nexport interface CanaryLeak {\n scenarioId: string\n canary: string\n runId?: string\n evidence: string\n}\n\nexport function checkCanaries(output: string, scenarios: DatasetScenario[]): CanaryLeak[] {\n const leaks: CanaryLeak[] = []\n for (const s of scenarios) {\n if (!s.canary) continue\n if (output.includes(s.canary)) {\n leaks.push({ scenarioId: s.id, canary: s.canary, evidence: excerpt(output, s.canary) })\n }\n }\n return leaks\n}\n\n/**\n * Behavioral canary: tests the AGENT, not the eval grep.\n *\n * - `forbiddenPattern` PRESENT in output ⇒ AGENT EMITTED BAD CONTENT ⇒ LEAK.\n * - `forbiddenPattern` ABSENT from output ⇒ AGENT HELD THE LINE ⇒ PASS.\n *\n * Use when running known-bad-prompt scenarios against the agent under\n * test and you want to know if the agent misbehaved. The classical\n * {@link checkCanaries} / {@link import('./canary').runCanaries | runCanaries}\n * test whether the eval check fires when the bad output is forced\n * into the eval flow — different question, different answer.\n *\n * Pattern resolution order (first match wins):\n * 1. `scenario.forbiddenPattern` — if it parses as `/body/flags`,\n * treated as a regex; otherwise a literal substring.\n * 2. `scenario.canary` — literal substring fallback so the helper\n * works on existing scenario fixtures.\n *\n * Returns `null` when nothing forbidden was found OR the scenario\n * declared no pattern.\n */\nexport function checkBehavioralCanary(\n output: string,\n scenario: DatasetScenario,\n): CanaryLeak | null {\n const pattern = scenario.forbiddenPattern ?? scenario.canary\n if (!pattern) return null\n const hit = matchForbidden(output, pattern)\n if (!hit) return null\n return {\n scenarioId: scenario.id,\n canary: pattern,\n evidence: excerpt(output, hit),\n }\n}\n\n/**\n * Behavioral canary over many (scenario, output) pairs. Sibling to\n * {@link import('./canary').runCanaries | runCanaries} — same idea\n * (run-many → report) but the question being answered is \"did the\n * AGENT misbehave?\" rather than \"did the EVAL grep fire?\".\n *\n * Returns one `CanaryLeak` per pair where the agent's output\n * contained its scenario's `forbiddenPattern` (or `canary` fallback).\n */\nexport function runBehavioralCanaries(\n cases: Array<{ scenario: DatasetScenario; output: string; runId?: string }>,\n): CanaryLeak[] {\n const leaks: CanaryLeak[] = []\n for (const c of cases) {\n const leak = checkBehavioralCanary(c.output, c.scenario)\n if (leak) leaks.push({ ...leak, runId: c.runId ?? leak.runId })\n }\n return leaks\n}\n\n/**\n * Resolve a forbidden-pattern string to the matched substring inside\n * `output`. `/body/flags` notation is interpreted as a regex; anything\n * else is a literal substring.\n */\nfunction matchForbidden(output: string, pattern: string): string | null {\n const re = tryParseRegex(pattern)\n if (re) {\n const m = output.match(re)\n return m && m[0].length > 0 ? m[0] : null\n }\n return output.includes(pattern) ? pattern : null\n}\n\nfunction tryParseRegex(pattern: string): RegExp | null {\n if (pattern.length < 2 || pattern[0] !== '/') return null\n const last = pattern.lastIndexOf('/')\n if (last <= 0) return null\n const body = pattern.slice(1, last)\n const flags = pattern.slice(last + 1)\n if (!/^[gimsuy]*$/.test(flags)) return null\n try {\n return new RegExp(body, flags)\n } catch {\n return null\n }\n}\n\n/**\n * Scan the LLM-output history in a corpus; returns every case where a\n * canary from a known scenario appeared in agent output. Pass the full\n * set of scenarios whose canaries you care about (typically the whole\n * held-out slice).\n */\nexport async function canaryLeakView(\n store: TraceStore,\n scenarios: DatasetScenario[],\n): Promise<CanaryLeak[]> {\n const targets = scenarios.filter((s) => !!s.canary)\n if (targets.length === 0) return []\n const spans = await llmSpans(store)\n const leaks: CanaryLeak[] = []\n for (const span of spans) {\n const output = span.output ?? ''\n for (const s of targets) {\n if (s.canary && output.includes(s.canary)) {\n leaks.push({\n scenarioId: s.id,\n canary: s.canary,\n runId: span.runId,\n evidence: excerpt(output, s.canary),\n })\n }\n }\n }\n return leaks\n}\n\nexport class HoldoutAuditor {\n private scenarios: DatasetScenario[]\n private accessLog: Array<{ scenarioId: string; purpose: string; at: number }> = []\n\n constructor(scenarios: DatasetScenario[]) {\n this.scenarios = scenarios\n }\n\n /** Retrieve a holdout scenario for a declared purpose. Non-'evaluation' throws. */\n get(scenarioId: string, purpose: 'evaluation' | 'debugging'): DatasetScenario {\n if (purpose !== 'evaluation' && purpose !== 'debugging') {\n throw new Error(\n `HoldoutAuditor.get: purpose must be 'evaluation' or 'debugging', got ${purpose}`,\n )\n }\n const s = this.scenarios.find((x) => x.id === scenarioId)\n if (!s) throw new Error(`holdout scenario \"${scenarioId}\" not found`)\n this.accessLog.push({ scenarioId, purpose, at: Date.now() })\n return s\n }\n\n getAccessLog(): ReadonlyArray<{ scenarioId: string; purpose: string; at: number }> {\n return this.accessLog\n }\n}\n\nfunction excerpt(source: string, needle: string): string {\n const at = source.indexOf(needle)\n if (at < 0) return ''\n const start = Math.max(0, at - 30)\n const end = Math.min(source.length, at + needle.length + 30)\n return (start > 0 ? '…' : '') + source.slice(start, end) + (end < source.length ? '…' : '')\n}\n"],"mappings":";;;;;;;;AAqDO,IAAM,wBAAN,cAAoC,eAAe;AAAA,EACxD,YACE,SACgB,QAChB;AACA,UAAM,qBAAqB,OAAO;AAFlB;AAAA,EAGlB;AAAA,EAHkB;AAIpB;AAEA,SAAS,aAAa,KAAyB;AAC7C,SAAO,IAAI,WAAW,UAAU,KAAK,IAAI,WAAW,WAAW;AACjE;AAEA,SAAS,iBAAiB,KAAyB;AACjD,SAAO,IAAI,WAAW,SAAS,KAAK,IAAI,YAAY;AACtD;AAOO,SAAS,0BACd,SACwB;AACxB,QAAM,eAAe,QAAQ;AAC7B,MAAI,cAAc;AAClB,MAAI,cAAc;AAClB,MAAI,kBAAkB;AACtB,MAAI,mBAAmB;AACvB,MAAI,oBAAoB;AACxB,MAAI,eAAe;AACnB,aAAW,OAAO,SAAS;AACzB,wBAAoB,IAAI,WAAW;AACnC,yBAAqB,IAAI,WAAW;AACpC,oBAAgB,IAAI;AACpB,QAAI,aAAa,GAAG,EAAG;AAAA,QAClB;AACL,QAAI,iBAAiB,GAAG,EAAG;AAAA,EAC7B;AACA,QAAM,UACJ,iBAAiB,IACb,SACA,gBAAgB,eACd,SACA,gBAAgB,IACd,SACA;AACV,QAAM,YAAY,eAAe;AAAA,IAC/B;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF,CAAC;AACD,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AAEA,SAAS,eAAe,GAAsD;AAC5E,MAAI,EAAE,iBAAiB,GAAG;AACxB,WAAO;AAAA,EACT;AACA,MAAI,EAAE,YAAY,QAAQ;AACxB,WAAO;AAAA,MACL,OAAO,EAAE,YAAY;AAAA,MACrB;AAAA,MACA;AAAA,MACA;AAAA,IACF,EAAE,KAAK,GAAG;AAAA,EACZ;AACA,MAAI,EAAE,YAAY,SAAS;AACzB,UAAM,OAAQ,EAAE,cAAc,EAAE,eAAgB,KAAK,QAAQ,CAAC;AAC9D,WAAO;AAAA,MACL,GAAG,EAAE,WAAW,IAAI,EAAE,YAAY,aAAa,GAAG;AAAA,MAClD;AAAA,MACA;AAAA,IACF,EAAE,KAAK,GAAG;AAAA,EACZ;AAEA,MAAI,EAAE,kBAAkB,GAAG;AACzB,UAAM,OAAQ,EAAE,kBAAkB,EAAE,eAAgB,KAAK,QAAQ,CAAC;AAClE,WAAO;AAAA,MACL,GAAG,EAAE,YAAY,uCAAuC,EAAE,gBAAgB,SAAS,EAAE,iBAAiB;AAAA,MACtG,GAAG,EAAE,eAAe,KAAK,GAAG;AAAA,MAC5B;AAAA,IACF,EAAE,KAAK,GAAG;AAAA,EACZ;AACA,SAAO,GAAG,EAAE,YAAY,uCAAuC,EAAE,gBAAgB,SAAS,EAAE,iBAAiB,aAAa,EAAE,aAAa,QAAQ,CAAC,CAAC;AACrJ;AASO,SAAS,kBACd,SACA,OAAiC,CAAC,GACV;AACxB,QAAM,SAAS,0BAA0B,OAAO;AAChD,QAAM,aAAa,KAAK,cAAc;AACtC,MAAI,OAAO,YAAY,QAAQ;AAC7B,UAAM,IAAI;AAAA,MACR,wEAAmE,OAAO,SAAS;AAAA,MACnF;AAAA,IACF;AAAA,EACF;AACA,MAAI,CAAC,cAAc,OAAO,YAAY,SAAS;AAC7C,UAAM,IAAI;AAAA,MACR,8DAAyD,OAAO,SAAS;AAAA,MACzE;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;;;AC1JO,SAAS,cAAc,QAAgB,WAA4C;AACxF,QAAM,QAAsB,CAAC;AAC7B,aAAW,KAAK,WAAW;AACzB,QAAI,CAAC,EAAE,OAAQ;AACf,QAAI,OAAO,SAAS,EAAE,MAAM,GAAG;AAC7B,YAAM,KAAK,EAAE,YAAY,EAAE,IAAI,QAAQ,EAAE,QAAQ,UAAU,QAAQ,QAAQ,EAAE,MAAM,EAAE,CAAC;AAAA,IACxF;AAAA,EACF;AACA,SAAO;AACT;AAuBO,SAAS,sBACd,QACA,UACmB;AACnB,QAAM,UAAU,SAAS,oBAAoB,SAAS;AACtD,MAAI,CAAC,QAAS,QAAO;AACrB,QAAM,MAAM,eAAe,QAAQ,OAAO;AAC1C,MAAI,CAAC,IAAK,QAAO;AACjB,SAAO;AAAA,IACL,YAAY,SAAS;AAAA,IACrB,QAAQ;AAAA,IACR,UAAU,QAAQ,QAAQ,GAAG;AAAA,EAC/B;AACF;AAWO,SAAS,sBACd,OACc;AACd,QAAM,QAAsB,CAAC;AAC7B,aAAW,KAAK,OAAO;AACrB,UAAM,OAAO,sBAAsB,EAAE,QAAQ,EAAE,QAAQ;AACvD,QAAI,KAAM,OAAM,KAAK,EAAE,GAAG,MAAM,OAAO,EAAE,SAAS,KAAK,MAAM,CAAC;AAAA,EAChE;AACA,SAAO;AACT;AAOA,SAAS,eAAe,QAAgB,SAAgC;AACtE,QAAM,KAAK,cAAc,OAAO;AAChC,MAAI,IAAI;AACN,UAAM,IAAI,OAAO,MAAM,EAAE;AACzB,WAAO,KAAK,EAAE,CAAC,EAAE,SAAS,IAAI,EAAE,CAAC,IAAI;AAAA,EACvC;AACA,SAAO,OAAO,SAAS,OAAO,IAAI,UAAU;AAC9C;AAEA,SAAS,cAAc,SAAgC;AACrD,MAAI,QAAQ,SAAS,KAAK,QAAQ,CAAC,MAAM,IAAK,QAAO;AACrD,QAAM,OAAO,QAAQ,YAAY,GAAG;AACpC,MAAI,QAAQ,EAAG,QAAO;AACtB,QAAM,OAAO,QAAQ,MAAM,GAAG,IAAI;AAClC,QAAM,QAAQ,QAAQ,MAAM,OAAO,CAAC;AACpC,MAAI,CAAC,cAAc,KAAK,KAAK,EAAG,QAAO;AACvC,MAAI;AACF,WAAO,IAAI,OAAO,MAAM,KAAK;AAAA,EAC/B,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAQA,eAAsB,eACpB,OACA,WACuB;AACvB,QAAM,UAAU,UAAU,OAAO,CAAC,MAAM,CAAC,CAAC,EAAE,MAAM;AAClD,MAAI,QAAQ,WAAW,EAAG,QAAO,CAAC;AAClC,QAAM,QAAQ,MAAM,SAAS,KAAK;AAClC,QAAM,QAAsB,CAAC;AAC7B,aAAW,QAAQ,OAAO;AACxB,UAAM,SAAS,KAAK,UAAU;AAC9B,eAAW,KAAK,SAAS;AACvB,UAAI,EAAE,UAAU,OAAO,SAAS,EAAE,MAAM,GAAG;AACzC,cAAM,KAAK;AAAA,UACT,YAAY,EAAE;AAAA,UACd,QAAQ,EAAE;AAAA,UACV,OAAO,KAAK;AAAA,UACZ,UAAU,QAAQ,QAAQ,EAAE,MAAM;AAAA,QACpC,CAAC;AAAA,MACH;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AAEO,IAAM,iBAAN,MAAqB;AAAA,EAClB;AAAA,EACA,YAAwE,CAAC;AAAA,EAEjF,YAAY,WAA8B;AACxC,SAAK,YAAY;AAAA,EACnB;AAAA;AAAA,EAGA,IAAI,YAAoB,SAAsD;AAC5E,QAAI,YAAY,gBAAgB,YAAY,aAAa;AACvD,YAAM,IAAI;AAAA,QACR,wEAAwE,OAAO;AAAA,MACjF;AAAA,IACF;AACA,UAAM,IAAI,KAAK,UAAU,KAAK,CAAC,MAAM,EAAE,OAAO,UAAU;AACxD,QAAI,CAAC,EAAG,OAAM,IAAI,MAAM,qBAAqB,UAAU,aAAa;AACpE,SAAK,UAAU,KAAK,EAAE,YAAY,SAAS,IAAI,KAAK,IAAI,EAAE,CAAC;AAC3D,WAAO;AAAA,EACT;AAAA,EAEA,eAAmF;AACjF,WAAO,KAAK;AAAA,EACd;AACF;AAEA,SAAS,QAAQ,QAAgB,QAAwB;AACvD,QAAM,KAAK,OAAO,QAAQ,MAAM;AAChC,MAAI,KAAK,EAAG,QAAO;AACnB,QAAM,QAAQ,KAAK,IAAI,GAAG,KAAK,EAAE;AACjC,QAAM,MAAM,KAAK,IAAI,OAAO,QAAQ,KAAK,OAAO,SAAS,EAAE;AAC3D,UAAQ,QAAQ,IAAI,WAAM,MAAM,OAAO,MAAM,OAAO,GAAG,KAAK,MAAM,OAAO,SAAS,WAAM;AAC1F;","names":[]}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|