@tangle-network/agent-eval 0.52.0 → 0.54.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -0
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +7 -6
- package/dist/{baseline-4R5deP0N.d.ts → baseline-DE36-Np7.d.ts} +1 -1
- package/dist/benchmarks/index.d.ts +3 -2
- package/dist/builder-eval/index.d.ts +4 -3
- package/dist/campaign/index.d.ts +9 -7
- package/dist/campaign/index.js +33 -4
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-L7XMNXLO.js → chunk-J4DIMSRK.js} +2 -2
- package/dist/{chunk-BWZEGTES.js → chunk-NCK5QLGT.js} +1 -1
- package/dist/chunk-NCK5QLGT.js.map +1 -0
- package/dist/{chunk-5KSDYBYH.js → chunk-YXTT6GSZ.js} +2 -2
- package/dist/contract/index.d.ts +25 -12
- package/dist/contract/index.js +171 -0
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-ojEWkMfJ.d.ts → control-DjEgwWNo.d.ts} +6 -5
- package/dist/{control-runtime-BZ_lVLYW.d.ts → control-runtime-DuFBYg7A.d.ts} +3 -2
- package/dist/control.d.ts +7 -6
- package/dist/control.js +2 -2
- package/dist/{emitter-DP_cSSiw.d.ts → emitter-DEZwY14K.d.ts} +2 -1
- package/dist/{failure-cluster-Cw65_5FY.d.ts → failure-cluster-CL7IVgkJ.d.ts} +2 -1
- package/dist/{feedback-trajectory-BSxqEpu7.d.ts → feedback-trajectory-DpUmE90J.d.ts} +1 -1
- package/dist/governance/index.d.ts +3 -2
- package/dist/hosted/index.d.ts +7 -6
- package/dist/{index-DQHtWQ57.d.ts → index-D2nT6_KT.d.ts} +66 -2
- package/dist/{index-0pu_fBwZ.d.ts → index-wlaiph9Y.d.ts} +1 -1
- package/dist/index.d.ts +31 -29
- package/dist/index.js +3 -3
- package/dist/{integrity-CTDhR1Sg.d.ts → integrity-CfXjSqEv.d.ts} +1 -1
- package/dist/knowledge/index.d.ts +4 -3
- package/dist/meta-eval/index.d.ts +4 -3
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.d.ts +7 -6
- package/dist/prm/index.d.ts +5 -4
- package/dist/{query-DODUYdPg.d.ts → query-CqTxMwDw.d.ts} +2 -1
- package/dist/{red-team-30II1T4o.d.ts → red-team-CrC5MZYd.d.ts} +1 -1
- package/dist/{registry-8KAs18kY.d.ts → registry-BSWy0rvH.d.ts} +1 -1
- package/dist/{release-report-DSu0DWy8.d.ts → release-report-B6l5fi7T.d.ts} +2 -2
- package/dist/reporting.d.ts +7 -6
- package/dist/{researcher-LZD0qHEa.d.ts → researcher-D4AZjxNa.d.ts} +5 -5
- package/dist/rl.d.ts +11 -10
- package/dist/rl.js +2 -2
- package/dist/{rubric-D5tjHNJQ.d.ts → rubric-BOfxn4ja.d.ts} +3 -2
- package/dist/{rubric-predictive-validity-ByZEC3BX.d.ts → rubric-predictive-validity-B3qNa4aY.d.ts} +1 -1
- package/dist/{run-improvement-loop-Cc7oZlRP.d.ts → run-improvement-loop-BhfdjrMY.d.ts} +3 -3
- package/dist/{run-record-BGY6bHRh.d.ts → run-record-etiCMsUq.d.ts} +11 -3
- package/dist/{store-Db2Bv8Cf.d.ts → schema-m0gsnbt3.d.ts} +1 -99
- package/dist/store-CKUAgsJz.d.ts +101 -0
- package/dist/{summary-report-B7gNRX-r.d.ts → summary-report-DLxh4yWk.d.ts} +2 -2
- package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BdVaPyHT.d.ts} +3 -2
- package/dist/traces.d.ts +7 -6
- package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-GEdXJCL5.d.ts} +2 -1
- package/dist/{types-Dbj5gu8n.d.ts → types-BgrxOJSf.d.ts} +31 -1
- package/dist/wire/index.d.ts +5 -4
- package/docs/design/self-improvement-protocol.md +223 -0
- package/docs/pilot/README.md +62 -0
- package/docs/pilot/customer-checklist.md +90 -0
- package/docs/pilot/integration-foreign-stack.md +296 -0
- package/docs/pilot/integration-tangle-stack.md +248 -0
- package/docs/pilot/one-pager.md +161 -0
- package/docs/pilot/sample-insight-report.json +172 -0
- package/docs/research/research-roadmap.md +204 -0
- package/package.json +1 -1
- package/dist/chunk-BWZEGTES.js.map +0 -1
- /package/dist/{chunk-L7XMNXLO.js.map → chunk-J4DIMSRK.js.map} +0 -0
- /package/dist/{chunk-5KSDYBYH.js.map → chunk-YXTT6GSZ.js.map} +0 -0
package/dist/contract/index.js
CHANGED
|
@@ -75,17 +75,21 @@ async function analyzeRuns(opts) {
|
|
|
75
75
|
const interRater = opts.raterScores ? computeInterRater(opts.raterScores) : void 0;
|
|
76
76
|
const lift = computeLift(runs, opts.baselineCandidateId, opts.candidateCandidateId, split);
|
|
77
77
|
const failureClusters = opts.analyst ? await computeFailureClusters(runs, opts.analyst, split) : void 0;
|
|
78
|
+
const failureModes = computeFailureModes(runs);
|
|
78
79
|
const contamination = opts.canaryScenarios ? computeContamination(runs, opts.canaryScenarios) : void 0;
|
|
79
80
|
const outcomeCorrelation = opts.outcomeSignal ? computeOutcomeCorrelation(runs, opts.outcomeSignal, split) : void 0;
|
|
80
81
|
const release = buildReleaseScorecard(composite, lift, contamination);
|
|
82
|
+
const priorPeriodComparison = opts.baselineRuns ? computePriorPeriodComparison(runs, opts.baselineRuns, split, opts.baselineLabel) : void 0;
|
|
81
83
|
const recommendations = buildRecommendations({
|
|
82
84
|
composite,
|
|
83
85
|
judges,
|
|
84
86
|
interRater,
|
|
85
87
|
lift,
|
|
86
88
|
failureClusters,
|
|
89
|
+
failureModes,
|
|
87
90
|
contamination,
|
|
88
91
|
outcomeCorrelation,
|
|
92
|
+
priorPeriodComparison,
|
|
89
93
|
threshold
|
|
90
94
|
});
|
|
91
95
|
return {
|
|
@@ -100,9 +104,138 @@ async function analyzeRuns(opts) {
|
|
|
100
104
|
contamination,
|
|
101
105
|
outcomeCorrelation,
|
|
102
106
|
release,
|
|
107
|
+
...failureModes ? { failureModes } : {},
|
|
108
|
+
...priorPeriodComparison ? { priorPeriodComparison } : {},
|
|
103
109
|
recommendations
|
|
104
110
|
};
|
|
105
111
|
}
|
|
112
|
+
/**
 * Tally how often each named failure label occurs across a set of runs.
 *
 * A run's label is `failureClass` when that is not null/undefined, otherwise
 * `failureMode`. Runs whose label is falsy are not counted.
 *
 * @param {Array<object>} runs - Run records to inspect.
 * @returns {Array<{mode: string, count: number, share: number}>|undefined}
 *   One entry per label, ordered by descending count and then by
 *   `localeCompare` on the label. `share` is `count / runs.length` (all runs,
 *   not only the labelled ones). Returns `undefined` when no run has a label.
 */
function computeFailureModes(runs) {
  const tally = new Map();
  for (const run of runs) {
    const label = run.failureClass ?? run.failureMode;
    if (!label) continue;
    const seen = tally.get(label) ?? 0;
    tally.set(label, seen + 1);
  }
  if (tally.size === 0) return undefined;

  const total = runs.length;
  const rows = [];
  for (const [mode, count] of tally) {
    rows.push({ mode, count, share: total > 0 ? count / total : 0 });
  }
  rows.sort((left, right) => {
    const byCount = right.count - left.count;
    return byCount !== 0 ? byCount : left.mode.localeCompare(right.mode);
  });
  return rows;
}
|
|
122
|
+
/**
 * Compare the current runs against a baseline (prior-period) set of runs.
 *
 * For each of composite, cost (`costUsd`), duration (`wallMs`), token usage
 * (`tokenUsage.input + tokenUsage.output`, missing parts counted as 0) and
 * every per-dimension judge score present in both sets, the finite values are
 * passed to `welchCompare(baselineValues, currentValues)`. A metric is omitted
 * when either side has no finite values.
 *
 * @param {Array<object>} current - Runs from the period under analysis.
 * @param {Array<object>} baseline - Runs from the reference period.
 * @param {*} split - Forwarded unchanged to `compositeOf`.
 * @param {string} [windowLabel] - Optional label, echoed on the result when truthy.
 * @returns {object|undefined} `{ baselineN, currentN, windowLabel?, metrics,
 *   regressedMetrics, improvedMetrics }`, or `undefined` when either input is
 *   empty. Only metrics flagged `significant` are classified: composite and
 *   `dim.*` improve when the delta is positive; cost, duration and tokenUsage
 *   improve when it is negative.
 */
function computePriorPeriodComparison(current, baseline, split, windowLabel) {
  if (current.length === 0 || baseline.length === 0) return undefined;

  const metrics = {};
  const directions = {};

  // Scalar metrics in the order they appear in `metrics`: [name, direction, extractor].
  const scalarMetrics = [
    ["composite", "higher-is-better", (r) => compositeOf(r, split)],
    ["cost", "lower-is-better", (r) => r.costUsd],
    ["duration", "lower-is-better", (r) => r.wallMs],
    ["tokenUsage", "lower-is-better", (r) => (r.tokenUsage.input ?? 0) + (r.tokenUsage.output ?? 0)],
  ];
  for (const [name, direction, pick] of scalarMetrics) {
    const currentValues = current.map((r) => pick(r)).filter(Number.isFinite);
    const baselineValues = baseline.map((r) => pick(r)).filter(Number.isFinite);
    if (currentValues.length === 0 || baselineValues.length === 0) continue;
    metrics[name] = welchCompare(baselineValues, currentValues);
    directions[name] = direction;
  }

  // Per-dimension judge scores: only dimensions seen in the current runs are considered.
  const dimsCurrent = collectPerDimension(current);
  const dimsBaseline = collectPerDimension(baseline);
  for (const dim of Object.keys(dimsCurrent)) {
    const baselineValues = dimsBaseline[dim];
    const currentValues = dimsCurrent[dim];
    const comparable =
      baselineValues && baselineValues.length !== 0 && currentValues && currentValues.length !== 0;
    if (!comparable) continue;
    const key = `dim.${dim}`;
    metrics[key] = welchCompare(baselineValues, currentValues);
    directions[key] = "higher-is-better";
  }

  // Sort the significant metrics into improved / regressed by their direction.
  const regressedMetrics = [];
  const improvedMetrics = [];
  for (const [name, result] of Object.entries(metrics)) {
    if (!result.significant) continue;
    const wantsHigher = (directions[name] ?? "higher-is-better") === "higher-is-better";
    const improved = wantsHigher ? result.delta > 0 : result.delta < 0;
    (improved ? improvedMetrics : regressedMetrics).push(name);
  }

  return {
    baselineN: baseline.length,
    currentN: current.length,
    ...(windowLabel ? { windowLabel } : {}),
    metrics,
    regressedMetrics,
    improvedMetrics,
  };
}
|
|
177
|
+
/**
 * Group the finite per-dimension judge scores of a set of runs by dimension name.
 *
 * Reads `r.outcome.judgeScores?.perDimMean` from each run; runs without it are
 * skipped, as are non-finite values.
 *
 * Values are accumulated in a Map rather than on a plain object, so dimension
 * names that collide with `Object.prototype` members ("constructor",
 * "toString", "__proto__", ...) are handled like any other name. With a plain
 * `{}` accumulator the inherited member looks like an existing bucket and the
 * subsequent `.push` throws.
 *
 * @param {Array<object>} runs - Run records; each must have an `outcome` object.
 * @returns {Object<string, number[]>} Plain object mapping each dimension name
 *   (as an own property) to its finite values, in run order.
 */
function collectPerDimension(runs) {
  const byDim = new Map();
  for (const r of runs) {
    const perDim = r.outcome.judgeScores?.perDimMean;
    if (!perDim) continue;
    for (const [dim, value] of Object.entries(perDim)) {
      if (!Number.isFinite(value)) continue;
      const bucket = byDim.get(dim);
      if (bucket) bucket.push(value);
      else byDim.set(dim, [value]);
    }
  }
  // Object.fromEntries defines own data properties, so even "__proto__" becomes a regular key.
  return Object.fromEntries(byDim);
}
|
|
190
|
+
/**
 * Compare two samples of numbers using unequal-variance (Welch-style) standard error.
 *
 * The standard error is `sqrt(varB / nB + varC / nC)`. Both the 95% interval
 * (fixed 1.96 multiplier) and the two-sided p-value are taken from the
 * standard normal distribution via `standardNormalCdf`. When the standard
 * error is not positive the interval collapses to the delta and the p-value is 1.
 * Cohen's d divides the delta by the pooled standard deviation (0 when that
 * deviation is not positive).
 *
 * @param {number[]} baseline - Reference sample.
 * @param {number[]} current - Sample under test.
 * @returns {{current: number, baseline: number, delta: number, ci95: number[],
 *   pValue: number, cohensD: number, baselineN: number, currentN: number,
 *   significant: boolean}} `delta` is current mean minus baseline mean;
 *   `significant` requires both `pValue < 0.05` and `|cohensD| >= 0.2`.
 */
function welchCompare(baseline, current) {
  const baselineN = baseline.length;
  const currentN = current.length;

  const baselineMean = mean(baseline);
  const currentMean = mean(current);
  const baselineVar = sampleVariance(baseline, baselineMean);
  const currentVar = sampleVariance(current, currentMean);

  const delta = currentMean - baselineMean;
  const se = Math.sqrt(baselineVar / baselineN + currentVar / currentN);
  const hasSpread = se > 0;

  const halfWidth = 1.96 * (hasSpread ? se : 0);
  const ci95 = [delta - halfWidth, delta + halfWidth];

  let pValue = 1;
  if (hasSpread) {
    const statistic = delta / se;
    pValue = 2 * (1 - standardNormalCdf(Math.abs(statistic)));
  }

  // Pooled standard deviation; the divisor is clamped to 1 so two single-element samples do not divide by zero.
  const pooledStddev = Math.sqrt(
    ((baselineN - 1) * baselineVar + (currentN - 1) * currentVar) / Math.max(1, baselineN + currentN - 2)
  );
  const effectSize = pooledStddev > 0 ? delta / pooledStddev : 0;

  return {
    current: currentMean,
    baseline: baselineMean,
    delta,
    ci95,
    pValue,
    cohensD: effectSize,
    baselineN,
    currentN,
    significant: pValue < 0.05 && Math.abs(effectSize) >= 0.2,
  };
}
|
|
220
|
+
/**
 * Unbiased sample variance (divisor n - 1) of `xs` around a precomputed mean.
 *
 * @param {number[]} xs - Sample values.
 * @param {number} xsMean - Mean of `xs`, supplied by the caller.
 * @returns {number} Sum of squared deviations divided by `xs.length - 1`,
 *   or 0 when there are fewer than two values.
 */
function sampleVariance(xs, xsMean) {
  const count = xs.length;
  if (count < 2) return 0;
  let sumSquares = 0;
  for (let i = 0; i < count; i += 1) {
    const deviation = xs[i] - xsMean;
    sumSquares += deviation ** 2;
  }
  return sumSquares / (count - 1);
}
|
|
226
|
+
/**
 * Approximate the standard normal cumulative distribution function.
 *
 * Computes erf(|z| / sqrt(2)) with a five-coefficient polynomial in
 * `t = 1 / (1 + p * x)` multiplied by `exp(-x^2)` (the constants match the
 * Abramowitz & Stegun 7.1.26 approximation), then maps it to
 * `0.5 * (1 ± erf)` according to the sign of `z`.
 *
 * @param {number} z - Standard score.
 * @returns {number} Approximate P(Z <= z).
 */
function standardNormalCdf(z) {
  // Polynomial coefficients, highest order first, for Horner evaluation.
  const coefficients = [1.061405429, -1.453152027, 1.421413741, -0.284496736, 0.254829592];
  const scaled = Math.abs(z) / Math.SQRT2;
  const t = 1 / (1 + 0.3275911 * scaled);

  let poly = coefficients[0];
  for (let i = 1; i < coefficients.length; i += 1) {
    poly = poly * t + coefficients[i];
  }

  const erfMagnitude = 1 - poly * t * Math.exp(-scaled * scaled);
  return z < 0 ? 0.5 * (1 - erfMagnitude) : 0.5 * (1 + erfMagnitude);
}
|
|
106
239
|
function resolveSplit(runs, pref) {
|
|
107
240
|
if (pref !== "auto") return pref;
|
|
108
241
|
const hasHoldout = runs.some((r) => Number.isFinite(r.outcome.holdoutScore));
|
|
@@ -492,6 +625,32 @@ function buildReleaseScorecard(composite, lift, contamination) {
|
|
|
492
625
|
}
|
|
493
626
|
function buildRecommendations(ctx) {
|
|
494
627
|
const out = [];
|
|
628
|
+
if (ctx.priorPeriodComparison) {
|
|
629
|
+
const ppc = ctx.priorPeriodComparison;
|
|
630
|
+
const label = ppc.windowLabel ?? "baseline period";
|
|
631
|
+
for (const name of ppc.regressedMetrics) {
|
|
632
|
+
const d = ppc.metrics[name];
|
|
633
|
+
if (!d) continue;
|
|
634
|
+
out.push({
|
|
635
|
+
priority: "critical",
|
|
636
|
+
kind: "investigate",
|
|
637
|
+
title: `${name} regressed from ${d.baseline.toFixed(3)} \u2192 ${d.current.toFixed(3)} vs ${label}`,
|
|
638
|
+
detail: `Welch CI95 = [${d.ci95[0].toFixed(3)}, ${d.ci95[1].toFixed(3)}], p=${d.pValue.toFixed(4)}, Cohen's d=${d.cohensD.toFixed(2)} (n_current=${d.currentN}, n_baseline=${d.baselineN}). The regression is statistically significant at p<0.05 with at-least-small effect size.`,
|
|
639
|
+
evidencePath: `priorPeriodComparison.metrics.${name}`
|
|
640
|
+
});
|
|
641
|
+
}
|
|
642
|
+
for (const name of ppc.improvedMetrics) {
|
|
643
|
+
const d = ppc.metrics[name];
|
|
644
|
+
if (!d) continue;
|
|
645
|
+
out.push({
|
|
646
|
+
priority: "low",
|
|
647
|
+
kind: "ship",
|
|
648
|
+
title: `${name} improved from ${d.baseline.toFixed(3)} \u2192 ${d.current.toFixed(3)} vs ${label}`,
|
|
649
|
+
detail: `Welch CI95 = [${d.ci95[0].toFixed(3)}, ${d.ci95[1].toFixed(3)}], p=${d.pValue.toFixed(4)}, Cohen's d=${d.cohensD.toFixed(2)} (n_current=${d.currentN}, n_baseline=${d.baselineN}). Statistically significant improvement worth flagging.`,
|
|
650
|
+
evidencePath: `priorPeriodComparison.metrics.${name}`
|
|
651
|
+
});
|
|
652
|
+
}
|
|
653
|
+
}
|
|
495
654
|
if (ctx.composite.n > 0) {
|
|
496
655
|
if (ctx.composite.mean < 0.3) {
|
|
497
656
|
const tail = ctx.composite.tailRuns ?? [];
|
|
@@ -515,6 +674,18 @@ function buildRecommendations(ctx) {
|
|
|
515
674
|
});
|
|
516
675
|
}
|
|
517
676
|
}
|
|
677
|
+
if (ctx.failureModes && ctx.failureModes.length > 0) {
|
|
678
|
+
const top = ctx.failureModes[0];
|
|
679
|
+
if (top.count >= 3 && top.share >= 0.15) {
|
|
680
|
+
out.push({
|
|
681
|
+
priority: top.share >= 0.25 ? "high" : "medium",
|
|
682
|
+
kind: "investigate",
|
|
683
|
+
title: `'${top.mode}' is the dominant failure mode \u2014 ${top.count} runs (${(top.share * 100).toFixed(0)}% of the corpus)`,
|
|
684
|
+
detail: `The mean composite can look acceptable while one named failure dominates the lower tail. ${top.count} of ${ctx.composite.n} runs failed with '${top.mode}'${ctx.failureModes.length > 1 ? ` (next: '${ctx.failureModes[1].mode}' \xD7${ctx.failureModes[1].count})` : ""}. Fix this cause first.`,
|
|
685
|
+
evidencePath: "failureModes"
|
|
686
|
+
});
|
|
687
|
+
}
|
|
688
|
+
}
|
|
518
689
|
if (Object.keys(ctx.judges).length === 0 && ctx.composite.n > 0) {
|
|
519
690
|
out.push({
|
|
520
691
|
priority: "medium",
|