@tangle-network/agent-eval 0.49.0 → 0.50.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +8 -2
- package/dist/campaign/index.d.ts +3 -3
- package/dist/{chunk-PD3MH6WU.js → chunk-5KSDYBYH.js} +2 -2
- package/dist/{chunk-MNL6LXGQ.js → chunk-EGIPWXHL.js} +2 -98
- package/dist/chunk-EGIPWXHL.js.map +1 -0
- package/dist/{chunk-OYI6RZJK.js → chunk-FQK2CCIM.js} +1 -1
- package/dist/chunk-FQK2CCIM.js.map +1 -0
- package/dist/chunk-MAZ26DC7.js +99 -0
- package/dist/chunk-MAZ26DC7.js.map +1 -0
- package/dist/chunk-SHTXZ4O2.js +113 -0
- package/dist/chunk-SHTXZ4O2.js.map +1 -0
- package/dist/{chunk-KQ26DYTQ.js → chunk-UBQGWD3O.js} +2 -2
- package/dist/contract/index.d.ts +206 -9
- package/dist/contract/index.js +751 -3
- package/dist/contract/index.js.map +1 -1
- package/dist/governance/index.d.ts +1 -1
- package/dist/hosted/index.d.ts +8 -192
- package/dist/hosted/index.js +1 -1
- package/dist/index-BRxz6qov.d.ts +409 -0
- package/dist/index.d.ts +18 -462
- package/dist/index.js +14 -106
- package/dist/index.js.map +1 -1
- package/dist/meta-eval/index.d.ts +3 -3
- package/dist/openapi.json +1 -1
- package/dist/{outcome-store-BxJ3DQKJ.d.ts → outcome-store-D6KWmYvj.d.ts} +1 -1
- package/dist/registry-8KAs18kY.d.ts +457 -0
- package/dist/{release-report-DBB8lB1P.d.ts → release-report-DSu0DWy8.d.ts} +3 -296
- package/dist/reporting.d.ts +6 -4
- package/dist/reporting.js +6 -4
- package/dist/{researcher-CHMO56K0.d.ts → researcher-LZD0qHEa.d.ts} +1 -1
- package/dist/rl.d.ts +9 -8
- package/dist/rl.js +3 -2
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-predictive-validity-CJ08tGwq.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts} +1 -1
- package/dist/{run-improvement-loop-B-L8GgpW.d.ts → run-improvement-loop-BPMjNKMJ.d.ts} +2 -2
- package/dist/sequential-5iSVfzl2.d.ts +139 -0
- package/dist/store-CJbzDxZ2.d.ts +220 -0
- package/dist/{sequential-CbFH___X.d.ts → summary-report-B7gNRX-r.d.ts} +1 -139
- package/dist/traces.d.ts +3 -220
- package/dist/{types-8u72Gc76.d.ts → types-Dbj5gu8n.d.ts} +1 -1
- package/dist/types-DhqpAi_z.d.ts +296 -0
- package/package.json +1 -1
- package/dist/chunk-MNL6LXGQ.js.map +0 -1
- package/dist/chunk-OYI6RZJK.js.map +0 -1
- /package/dist/{chunk-PD3MH6WU.js.map → chunk-5KSDYBYH.js.map} +0 -0
- /package/dist/{chunk-KQ26DYTQ.js.map → chunk-UBQGWD3O.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
import {
|
|
2
|
+
HoldoutAuditor,
|
|
3
|
+
canaryLeakView,
|
|
4
|
+
checkBehavioralCanary,
|
|
5
|
+
checkCanaries,
|
|
6
|
+
runBehavioralCanaries
|
|
7
|
+
} from "./chunk-SHTXZ4O2.js";
|
|
1
8
|
import {
|
|
2
9
|
DEFAULT_MUTATION_PRIMITIVES,
|
|
3
10
|
DEFAULT_RED_TEAM_CORPUS,
|
|
@@ -82,10 +89,10 @@ import {
|
|
|
82
89
|
evaluateReleaseConfidence,
|
|
83
90
|
judgeReplayGate,
|
|
84
91
|
renderReleaseReport
|
|
85
|
-
} from "./chunk-KQ26DYTQ.js";
|
|
92
|
+
} from "./chunk-UBQGWD3O.js";
|
|
86
93
|
import {
|
|
87
94
|
runEvalCampaign
|
|
88
|
-
} from "./chunk-PD3MH6WU.js";
|
|
95
|
+
} from "./chunk-5KSDYBYH.js";
|
|
89
96
|
import {
|
|
90
97
|
AGENT_PROFILE_KINDS,
|
|
91
98
|
AgentProfileCellValidationError,
|
|
@@ -106,14 +113,16 @@ import {
|
|
|
106
113
|
verifyAgentProfileCell
|
|
107
114
|
} from "./chunk-BWZEGTES.js";
|
|
108
115
|
import {
|
|
109
|
-
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
110
116
|
evaluateInterimReleaseConfidence,
|
|
117
|
+
pairedEvalueSequence
|
|
118
|
+
} from "./chunk-MAZ26DC7.js";
|
|
119
|
+
import {
|
|
120
|
+
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
111
121
|
gainHistogram,
|
|
112
|
-
pairedEvalueSequence,
|
|
113
122
|
paretoChart,
|
|
114
123
|
researchReport,
|
|
115
124
|
summaryTable
|
|
116
|
-
} from "./chunk-MNL6LXGQ.js";
|
|
125
|
+
} from "./chunk-EGIPWXHL.js";
|
|
117
126
|
import {
|
|
118
127
|
benjaminiHochberg,
|
|
119
128
|
bonferroni,
|
|
@@ -6938,107 +6947,6 @@ function defaultExtract(metric) {
|
|
|
6938
6947
|
};
|
|
6939
6948
|
}
|
|
6940
6949
|
|
|
6941
|
-
// src/contamination-guard.ts
|
|
6942
|
-
function checkCanaries(output, scenarios) {
|
|
6943
|
-
const leaks = [];
|
|
6944
|
-
for (const s of scenarios) {
|
|
6945
|
-
if (!s.canary) continue;
|
|
6946
|
-
if (output.includes(s.canary)) {
|
|
6947
|
-
leaks.push({ scenarioId: s.id, canary: s.canary, evidence: excerpt2(output, s.canary) });
|
|
6948
|
-
}
|
|
6949
|
-
}
|
|
6950
|
-
return leaks;
|
|
6951
|
-
}
|
|
6952
|
-
function checkBehavioralCanary(output, scenario) {
|
|
6953
|
-
const pattern = scenario.forbiddenPattern ?? scenario.canary;
|
|
6954
|
-
if (!pattern) return null;
|
|
6955
|
-
const hit = matchForbidden(output, pattern);
|
|
6956
|
-
if (!hit) return null;
|
|
6957
|
-
return {
|
|
6958
|
-
scenarioId: scenario.id,
|
|
6959
|
-
canary: pattern,
|
|
6960
|
-
evidence: excerpt2(output, hit)
|
|
6961
|
-
};
|
|
6962
|
-
}
|
|
6963
|
-
function runBehavioralCanaries(cases) {
|
|
6964
|
-
const leaks = [];
|
|
6965
|
-
for (const c of cases) {
|
|
6966
|
-
const leak = checkBehavioralCanary(c.output, c.scenario);
|
|
6967
|
-
if (leak) leaks.push({ ...leak, runId: c.runId ?? leak.runId });
|
|
6968
|
-
}
|
|
6969
|
-
return leaks;
|
|
6970
|
-
}
|
|
6971
|
-
function matchForbidden(output, pattern) {
|
|
6972
|
-
const re = tryParseRegex(pattern);
|
|
6973
|
-
if (re) {
|
|
6974
|
-
const m = output.match(re);
|
|
6975
|
-
return m && m[0].length > 0 ? m[0] : null;
|
|
6976
|
-
}
|
|
6977
|
-
return output.includes(pattern) ? pattern : null;
|
|
6978
|
-
}
|
|
6979
|
-
function tryParseRegex(pattern) {
|
|
6980
|
-
if (pattern.length < 2 || pattern[0] !== "/") return null;
|
|
6981
|
-
const last = pattern.lastIndexOf("/");
|
|
6982
|
-
if (last <= 0) return null;
|
|
6983
|
-
const body = pattern.slice(1, last);
|
|
6984
|
-
const flags = pattern.slice(last + 1);
|
|
6985
|
-
if (!/^[gimsuy]*$/.test(flags)) return null;
|
|
6986
|
-
try {
|
|
6987
|
-
return new RegExp(body, flags);
|
|
6988
|
-
} catch {
|
|
6989
|
-
return null;
|
|
6990
|
-
}
|
|
6991
|
-
}
|
|
6992
|
-
async function canaryLeakView(store, scenarios) {
|
|
6993
|
-
const targets = scenarios.filter((s) => !!s.canary);
|
|
6994
|
-
if (targets.length === 0) return [];
|
|
6995
|
-
const spans = await llmSpans(store);
|
|
6996
|
-
const leaks = [];
|
|
6997
|
-
for (const span of spans) {
|
|
6998
|
-
const output = span.output ?? "";
|
|
6999
|
-
for (const s of targets) {
|
|
7000
|
-
if (s.canary && output.includes(s.canary)) {
|
|
7001
|
-
leaks.push({
|
|
7002
|
-
scenarioId: s.id,
|
|
7003
|
-
canary: s.canary,
|
|
7004
|
-
runId: span.runId,
|
|
7005
|
-
evidence: excerpt2(output, s.canary)
|
|
7006
|
-
});
|
|
7007
|
-
}
|
|
7008
|
-
}
|
|
7009
|
-
}
|
|
7010
|
-
return leaks;
|
|
7011
|
-
}
|
|
7012
|
-
var HoldoutAuditor = class {
|
|
7013
|
-
scenarios;
|
|
7014
|
-
accessLog = [];
|
|
7015
|
-
constructor(scenarios) {
|
|
7016
|
-
this.scenarios = scenarios;
|
|
7017
|
-
}
|
|
7018
|
-
/** Retrieve a holdout scenario for a declared purpose. Non-'evaluation' throws. */
|
|
7019
|
-
get(scenarioId, purpose) {
|
|
7020
|
-
if (purpose !== "evaluation" && purpose !== "debugging") {
|
|
7021
|
-
throw new Error(
|
|
7022
|
-
`HoldoutAuditor.get: purpose must be 'evaluation' or 'debugging', got ${purpose}`
|
|
7023
|
-
);
|
|
7024
|
-
}
|
|
7025
|
-
const s = this.scenarios.find((x) => x.id === scenarioId);
|
|
7026
|
-
if (!s) throw new Error(`holdout scenario "${scenarioId}" not found`);
|
|
7027
|
-
this.accessLog.push({ scenarioId, purpose, at: Date.now() });
|
|
7028
|
-
return s;
|
|
7029
|
-
}
|
|
7030
|
-
getAccessLog() {
|
|
7031
|
-
return this.accessLog;
|
|
7032
|
-
}
|
|
7033
|
-
};
|
|
7034
|
-
function excerpt2(source, needle) {
|
|
7035
|
-
const at = source.indexOf(needle);
|
|
7036
|
-
if (at < 0) return "";
|
|
7037
|
-
const start = Math.max(0, at - 30);
|
|
7038
|
-
const end = Math.min(source.length, at + needle.length + 30);
|
|
7039
|
-
return (start > 0 ? "\u2026" : "") + source.slice(start, end) + (end < source.length ? "\u2026" : "");
|
|
7040
|
-
}
|
|
7041
|
-
|
|
7042
6950
|
// src/observability.ts
|
|
7043
6951
|
async function toLangfuseEnvelope(store, runId) {
|
|
7044
6952
|
const run = await store.getRun(runId);
|