@tangle-network/agent-eval 0.52.0 → 0.54.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. package/CHANGELOG.md +23 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/otel.d.ts +7 -6
  5. package/dist/{baseline-4R5deP0N.d.ts → baseline-DE36-Np7.d.ts} +1 -1
  6. package/dist/benchmarks/index.d.ts +3 -2
  7. package/dist/builder-eval/index.d.ts +4 -3
  8. package/dist/campaign/index.d.ts +9 -7
  9. package/dist/campaign/index.js +33 -4
  10. package/dist/campaign/index.js.map +1 -1
  11. package/dist/{chunk-L7XMNXLO.js → chunk-J4DIMSRK.js} +2 -2
  12. package/dist/{chunk-BWZEGTES.js → chunk-NCK5QLGT.js} +1 -1
  13. package/dist/chunk-NCK5QLGT.js.map +1 -0
  14. package/dist/{chunk-5KSDYBYH.js → chunk-YXTT6GSZ.js} +2 -2
  15. package/dist/contract/index.d.ts +25 -12
  16. package/dist/contract/index.js +171 -0
  17. package/dist/contract/index.js.map +1 -1
  18. package/dist/{control-ojEWkMfJ.d.ts → control-DjEgwWNo.d.ts} +6 -5
  19. package/dist/{control-runtime-BZ_lVLYW.d.ts → control-runtime-DuFBYg7A.d.ts} +3 -2
  20. package/dist/control.d.ts +7 -6
  21. package/dist/control.js +2 -2
  22. package/dist/{emitter-DP_cSSiw.d.ts → emitter-DEZwY14K.d.ts} +2 -1
  23. package/dist/{failure-cluster-Cw65_5FY.d.ts → failure-cluster-CL7IVgkJ.d.ts} +2 -1
  24. package/dist/{feedback-trajectory-BSxqEpu7.d.ts → feedback-trajectory-DpUmE90J.d.ts} +1 -1
  25. package/dist/governance/index.d.ts +3 -2
  26. package/dist/hosted/index.d.ts +7 -6
  27. package/dist/{index-DQHtWQ57.d.ts → index-D2nT6_KT.d.ts} +66 -2
  28. package/dist/{index-0pu_fBwZ.d.ts → index-wlaiph9Y.d.ts} +1 -1
  29. package/dist/index.d.ts +31 -29
  30. package/dist/index.js +3 -3
  31. package/dist/{integrity-CTDhR1Sg.d.ts → integrity-CfXjSqEv.d.ts} +1 -1
  32. package/dist/knowledge/index.d.ts +4 -3
  33. package/dist/meta-eval/index.d.ts +4 -3
  34. package/dist/openapi.json +1 -1
  35. package/dist/pipelines/index.d.ts +7 -6
  36. package/dist/prm/index.d.ts +5 -4
  37. package/dist/{query-DODUYdPg.d.ts → query-CqTxMwDw.d.ts} +2 -1
  38. package/dist/{red-team-30II1T4o.d.ts → red-team-CrC5MZYd.d.ts} +1 -1
  39. package/dist/{registry-8KAs18kY.d.ts → registry-BSWy0rvH.d.ts} +1 -1
  40. package/dist/{release-report-DSu0DWy8.d.ts → release-report-B6l5fi7T.d.ts} +2 -2
  41. package/dist/reporting.d.ts +7 -6
  42. package/dist/{researcher-LZD0qHEa.d.ts → researcher-D4AZjxNa.d.ts} +5 -5
  43. package/dist/rl.d.ts +11 -10
  44. package/dist/rl.js +2 -2
  45. package/dist/{rubric-D5tjHNJQ.d.ts → rubric-BOfxn4ja.d.ts} +3 -2
  46. package/dist/{rubric-predictive-validity-ByZEC3BX.d.ts → rubric-predictive-validity-B3qNa4aY.d.ts} +1 -1
  47. package/dist/{run-improvement-loop-Cc7oZlRP.d.ts → run-improvement-loop-BhfdjrMY.d.ts} +3 -3
  48. package/dist/{run-record-BGY6bHRh.d.ts → run-record-etiCMsUq.d.ts} +11 -3
  49. package/dist/{store-Db2Bv8Cf.d.ts → schema-m0gsnbt3.d.ts} +1 -99
  50. package/dist/store-CKUAgsJz.d.ts +101 -0
  51. package/dist/{summary-report-B7gNRX-r.d.ts → summary-report-DLxh4yWk.d.ts} +2 -2
  52. package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BdVaPyHT.d.ts} +3 -2
  53. package/dist/traces.d.ts +7 -6
  54. package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-GEdXJCL5.d.ts} +2 -1
  55. package/dist/{types-Dbj5gu8n.d.ts → types-BgrxOJSf.d.ts} +31 -1
  56. package/dist/wire/index.d.ts +5 -4
  57. package/docs/design/self-improvement-protocol.md +223 -0
  58. package/docs/pilot/README.md +62 -0
  59. package/docs/pilot/customer-checklist.md +90 -0
  60. package/docs/pilot/integration-foreign-stack.md +296 -0
  61. package/docs/pilot/integration-tangle-stack.md +248 -0
  62. package/docs/pilot/one-pager.md +161 -0
  63. package/docs/pilot/sample-insight-report.json +172 -0
  64. package/docs/research/research-roadmap.md +204 -0
  65. package/package.json +1 -1
  66. package/dist/chunk-BWZEGTES.js.map +0 -1
  67. /package/dist/{chunk-L7XMNXLO.js.map → chunk-J4DIMSRK.js.map} +0 -0
  68. /package/dist/{chunk-5KSDYBYH.js.map → chunk-YXTT6GSZ.js.map} +0 -0
@@ -75,17 +75,21 @@ async function analyzeRuns(opts) {
75
75
  const interRater = opts.raterScores ? computeInterRater(opts.raterScores) : void 0;
76
76
  const lift = computeLift(runs, opts.baselineCandidateId, opts.candidateCandidateId, split);
77
77
  const failureClusters = opts.analyst ? await computeFailureClusters(runs, opts.analyst, split) : void 0;
78
+ const failureModes = computeFailureModes(runs);
78
79
  const contamination = opts.canaryScenarios ? computeContamination(runs, opts.canaryScenarios) : void 0;
79
80
  const outcomeCorrelation = opts.outcomeSignal ? computeOutcomeCorrelation(runs, opts.outcomeSignal, split) : void 0;
80
81
  const release = buildReleaseScorecard(composite, lift, contamination);
82
+ const priorPeriodComparison = opts.baselineRuns ? computePriorPeriodComparison(runs, opts.baselineRuns, split, opts.baselineLabel) : void 0;
81
83
  const recommendations = buildRecommendations({
82
84
  composite,
83
85
  judges,
84
86
  interRater,
85
87
  lift,
86
88
  failureClusters,
89
+ failureModes,
87
90
  contamination,
88
91
  outcomeCorrelation,
92
+ priorPeriodComparison,
89
93
  threshold
90
94
  });
91
95
  return {
@@ -100,9 +104,138 @@ async function analyzeRuns(opts) {
100
104
  contamination,
101
105
  outcomeCorrelation,
102
106
  release,
107
+ ...failureModes ? { failureModes } : {},
108
+ ...priorPeriodComparison ? { priorPeriodComparison } : {},
103
109
  recommendations
104
110
  };
105
111
  }
112
/**
 * Tallies how often each named failure appears across `runs`.
 *
 * A run's failure name is its `failureClass`, falling back to `failureMode`.
 * Runs that carry neither are not counted, but they still contribute to the
 * denominator used for `share` (which is `runs.length`).
 *
 * @param {Array<{failureClass?: string, failureMode?: string}>} runs
 * @returns {Array<{mode: string, count: number, share: number}> | undefined}
 *   Entries sorted by descending count, ties broken by `localeCompare` on the
 *   mode name; `undefined` when no run names a failure.
 */
function computeFailureModes(runs) {
  const counts = new Map();
  for (const run of runs) {
    // `||` rather than `??`: an empty-string failureClass is not a usable
    // label, so it must not shadow a populated failureMode.
    const key = run.failureClass || run.failureMode;
    if (key) counts.set(key, (counts.get(key) ?? 0) + 1);
  }
  if (counts.size === 0) return void 0;

  const total = runs.length;
  return [...counts.entries()]
    .map(([mode, count]) => ({ mode, count, share: total > 0 ? count / total : 0 }))
    .sort((a, b) => b.count - a.count || a.mode.localeCompare(b.mode));
}
122
/**
 * Compares the current runs against a prior-period (baseline) set of runs.
 *
 * Metrics compared, in this order: `composite` (via `compositeOf(run, split)`),
 * `cost` (`costUsd`), `duration` (`wallMs`), `tokenUsage` (input + output) and
 * one `dim.<name>` entry per judge dimension returned by `collectPerDimension`
 * for the current runs. A metric is only reported when both periods have at
 * least one finite value for it. Each comparison is delegated to
 * `welchCompare(baselineValues, currentValues)`.
 *
 * @param {Array<object>} current  Runs from the current period.
 * @param {Array<object>} baseline Runs from the prior period.
 * @param {*} split                Passed through unchanged to `compositeOf`.
 * @param {string} [windowLabel]   Optional label echoed back in the result.
 * @returns {{baselineN: number, currentN: number, windowLabel?: string,
 *   metrics: Object, regressedMetrics: string[], improvedMetrics: string[]} | undefined}
 *   `undefined` when either period has no runs.
 */
function computePriorPeriodComparison(current, baseline, split, windowLabel) {
  if (current.length === 0 || baseline.length === 0) return void 0;

  const metrics = {};
  const directions = {};

  // Runs one comparison on the finite values `extract` yields for each period
  // and records it under `name`; skipped when either side has no usable value.
  const compareMetric = (name, direction, extract) => {
    const currentValues = current.map((r) => extract(r)).filter((v) => Number.isFinite(v));
    const baselineValues = baseline.map((r) => extract(r)).filter((v) => Number.isFinite(v));
    if (currentValues.length === 0 || baselineValues.length === 0) return;
    metrics[name] = welchCompare(baselineValues, currentValues);
    directions[name] = direction;
  };

  // Total tokens for a run. Returns undefined (filtered out above) when the run
  // has no token data at all, so such runs are excluded like runs with a
  // missing cost or duration instead of being counted as 0 tokens or throwing.
  const totalTokens = (r) => {
    const input = r.tokenUsage?.input;
    const output = r.tokenUsage?.output;
    if (input == null && output == null) return void 0;
    return (input ?? 0) + (output ?? 0);
  };

  compareMetric("composite", "higher-is-better", (r) => compositeOf(r, split));
  compareMetric("cost", "lower-is-better", (r) => r.costUsd);
  compareMetric("duration", "lower-is-better", (r) => r.wallMs);
  compareMetric("tokenUsage", "lower-is-better", totalTokens);

  const dimsCurrent = collectPerDimension(current);
  const dimsBaseline = collectPerDimension(baseline);
  for (const dim of Object.keys(dimsCurrent)) {
    const b = dimsBaseline[dim];
    const c = dimsCurrent[dim];
    if (!b || b.length === 0 || !c || c.length === 0) continue;
    metrics[`dim.${dim}`] = welchCompare(b, c);
    directions[`dim.${dim}`] = "higher-is-better";
  }

  // Only comparisons flagged `significant` are classified; the sign of the
  // delta is interpreted according to the metric's direction.
  const regressedMetrics = [];
  const improvedMetrics = [];
  for (const [name, delta] of Object.entries(metrics)) {
    if (!delta.significant) continue;
    const dir = directions[name] ?? "higher-is-better";
    const better = dir === "higher-is-better" ? delta.delta > 0 : delta.delta < 0;
    if (better) improvedMetrics.push(name);
    else regressedMetrics.push(name);
  }

  return {
    baselineN: baseline.length,
    currentN: current.length,
    ...windowLabel ? { windowLabel } : {},
    metrics,
    regressedMetrics,
    improvedMetrics
  };
}
177
/**
 * Groups per-dimension judge scores across runs.
 *
 * Reads `run.outcome.judgeScores.perDimMean` from every run that has it and
 * appends each finite value to the list for its dimension name.
 *
 * @param {Array<{outcome: {judgeScores?: {perDimMean?: Object<string, number>}}}>} runs
 * @returns {Object<string, number[]>} Map-like object (no prototype) from
 *   dimension name to the finite scores seen for it, in run order.
 */
function collectPerDimension(runs) {
  // Null-prototype accumulator: dimension names come from data, and a name such
  // as "constructor" or "toString" would otherwise resolve to an inherited
  // member, skip the initialisation below and crash on `.push`.
  const out = Object.create(null);
  for (const run of runs) {
    const perDim = run.outcome.judgeScores?.perDimMean;
    if (!perDim) continue;
    for (const [dim, value] of Object.entries(perDim)) {
      if (!Number.isFinite(value)) continue;
      if (out[dim] === undefined) out[dim] = [];
      out[dim].push(value);
    }
  }
  return out;
}
190
/**
 * Two-sample comparison of `current` against `baseline` using unpooled
 * (Welch-style) standard errors.
 *
 * The p-value and the 95% interval use a normal approximation
 * (`standardNormalCdf` and the 1.96 multiplier), not a t distribution.
 * A result is flagged `significant` only when p < 0.05 AND |Cohen's d| >= 0.2.
 *
 * The test is treated as not computable (p = 1, not significant) when the
 * standard error is zero or when either sample has fewer than two values:
 * `sampleVariance` reports 0 for a single observation, which would otherwise
 * make a one-run sample look like it has no uncertainty at all.
 *
 * @param {number[]} baseline Values from the prior period (callers pass non-empty arrays).
 * @param {number[]} current  Values from the current period (callers pass non-empty arrays).
 * @returns {{current: number, baseline: number, delta: number, ci95: number[],
 *   pValue: number, cohensD: number, baselineN: number, currentN: number,
 *   significant: boolean}} `delta` is current mean minus baseline mean.
 */
function welchCompare(baseline, current) {
  const baselineMean = mean(baseline);
  const currentMean = mean(current);
  const baselineVar = sampleVariance(baseline, baselineMean);
  const currentVar = sampleVariance(current, currentMean);
  const baselineN = baseline.length;
  const currentN = current.length;
  const delta = currentMean - baselineMean;

  const se = Math.sqrt(baselineVar / baselineN + currentVar / currentN);
  const halfWidth = 1.96 * (se > 0 ? se : 0);
  const ci95 = [delta - halfWidth, delta + halfWidth];

  // Both variances must be estimable for the statistic to mean anything.
  const testable = se > 0 && baselineN >= 2 && currentN >= 2;
  const z = testable ? delta / se : 0;
  const pValue = testable ? 2 * (1 - standardNormalCdf(Math.abs(z))) : 1;

  const pooledStddev = Math.sqrt(
    ((baselineN - 1) * baselineVar + (currentN - 1) * currentVar) / Math.max(1, baselineN + currentN - 2)
  );
  const effectSize = pooledStddev > 0 ? delta / pooledStddev : 0;
  const significant = pValue < 0.05 && Math.abs(effectSize) >= 0.2;

  return {
    current: currentMean,
    baseline: baselineMean,
    delta,
    ci95,
    pValue,
    cohensD: effectSize,
    baselineN,
    currentN,
    significant
  };
}
220
/**
 * Unbiased (n - 1 denominator) sample variance of `xs` around a mean the
 * caller has already computed.
 *
 * @param {number[]} xs     Observations.
 * @param {number} xsMean   Mean of `xs`.
 * @returns {number} The variance, or 0 when there are fewer than two observations.
 */
function sampleVariance(xs, xsMean) {
  const n = xs.length;
  if (n < 2) {
    return 0;
  }
  const sumOfSquares = xs.reduce((acc, x) => acc + (x - xsMean) ** 2, 0);
  return sumOfSquares / (n - 1);
}
226
/**
 * Approximates the standard normal cumulative distribution function at `z`.
 *
 * Evaluates a five-term polynomial in t = 1 / (1 + p * |z| / sqrt(2)) scaled by
 * exp(-z^2 / 2), then mirrors the result for negative `z`.
 * NOTE(review): the constants look like the Abramowitz & Stegun 7.1.26 erf
 * approximation — confirm before relying on its published error bound.
 *
 * @param {number} z Standard score.
 * @returns {number} Approximate P(Z <= z).
 */
function standardNormalCdf(z) {
  // Polynomial coefficients, highest order first, for Horner evaluation.
  const coefficients = [1.061405429, -1.453152027, 1.421413741, -0.284496736, 0.254829592];
  const p = 0.3275911;

  const magnitude = Math.abs(z) / Math.SQRT2;
  const t = 1 / (1 + p * magnitude);

  let poly = 0;
  for (const c of coefficients) {
    poly = (poly + c) * t;
  }
  const erfApprox = 1 - poly * Math.exp(-magnitude * magnitude);

  const sign = z < 0 ? -1 : 1;
  return 0.5 * (1 + sign * erfApprox);
}
106
239
  function resolveSplit(runs, pref) {
107
240
  if (pref !== "auto") return pref;
108
241
  const hasHoldout = runs.some((r) => Number.isFinite(r.outcome.holdoutScore));
@@ -492,6 +625,32 @@ function buildReleaseScorecard(composite, lift, contamination) {
492
625
  }
493
626
  function buildRecommendations(ctx) {
494
627
  const out = [];
628
+ if (ctx.priorPeriodComparison) {
629
+ const ppc = ctx.priorPeriodComparison;
630
+ const label = ppc.windowLabel ?? "baseline period";
631
+ for (const name of ppc.regressedMetrics) {
632
+ const d = ppc.metrics[name];
633
+ if (!d) continue;
634
+ out.push({
635
+ priority: "critical",
636
+ kind: "investigate",
637
+ title: `${name} regressed from ${d.baseline.toFixed(3)} \u2192 ${d.current.toFixed(3)} vs ${label}`,
638
+ detail: `Welch CI95 = [${d.ci95[0].toFixed(3)}, ${d.ci95[1].toFixed(3)}], p=${d.pValue.toFixed(4)}, Cohen's d=${d.cohensD.toFixed(2)} (n_current=${d.currentN}, n_baseline=${d.baselineN}). The regression is statistically significant at p<0.05 with at-least-small effect size.`,
639
+ evidencePath: `priorPeriodComparison.metrics.${name}`
640
+ });
641
+ }
642
+ for (const name of ppc.improvedMetrics) {
643
+ const d = ppc.metrics[name];
644
+ if (!d) continue;
645
+ out.push({
646
+ priority: "low",
647
+ kind: "ship",
648
+ title: `${name} improved from ${d.baseline.toFixed(3)} \u2192 ${d.current.toFixed(3)} vs ${label}`,
649
+ detail: `Welch CI95 = [${d.ci95[0].toFixed(3)}, ${d.ci95[1].toFixed(3)}], p=${d.pValue.toFixed(4)}, Cohen's d=${d.cohensD.toFixed(2)} (n_current=${d.currentN}, n_baseline=${d.baselineN}). Statistically significant improvement worth flagging.`,
650
+ evidencePath: `priorPeriodComparison.metrics.${name}`
651
+ });
652
+ }
653
+ }
495
654
  if (ctx.composite.n > 0) {
496
655
  if (ctx.composite.mean < 0.3) {
497
656
  const tail = ctx.composite.tailRuns ?? [];
@@ -515,6 +674,18 @@ function buildRecommendations(ctx) {
515
674
  });
516
675
  }
517
676
  }
677
+ if (ctx.failureModes && ctx.failureModes.length > 0) {
678
+ const top = ctx.failureModes[0];
679
+ if (top.count >= 3 && top.share >= 0.15) {
680
+ out.push({
681
+ priority: top.share >= 0.25 ? "high" : "medium",
682
+ kind: "investigate",
683
+ title: `'${top.mode}' is the dominant failure mode \u2014 ${top.count} runs (${(top.share * 100).toFixed(0)}% of the corpus)`,
684
+ detail: `The mean composite can look acceptable while one named failure dominates the lower tail. ${top.count} of ${ctx.composite.n} runs failed with '${top.mode}'${ctx.failureModes.length > 1 ? ` (next: '${ctx.failureModes[1].mode}' \xD7${ctx.failureModes[1].count})` : ""}. Fix this cause first.`,
685
+ evidencePath: "failureModes"
686
+ });
687
+ }
688
+ }
518
689
  if (Object.keys(ctx.judges).length === 0 && ctx.composite.n > 0) {
519
690
  out.push({
520
691
  priority: "medium",