@tangle-network/agent-eval 0.53.0 → 0.55.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. package/dist/adapters/http.d.ts +1 -1
  2. package/dist/adapters/langchain.d.ts +1 -1
  3. package/dist/adapters/otel.d.ts +7 -6
  4. package/dist/{baseline-4R5deP0N.d.ts → baseline-DE36-Np7.d.ts} +1 -1
  5. package/dist/benchmarks/index.d.ts +3 -2
  6. package/dist/builder-eval/index.d.ts +4 -3
  7. package/dist/campaign/index.d.ts +9 -7
  8. package/dist/campaign/index.js +33 -4
  9. package/dist/campaign/index.js.map +1 -1
  10. package/dist/{chunk-L7XMNXLO.js → chunk-J4DIMSRK.js} +2 -2
  11. package/dist/{chunk-5KSDYBYH.js → chunk-LYL4SOKT.js} +3 -2
  12. package/dist/chunk-LYL4SOKT.js.map +1 -0
  13. package/dist/{chunk-BWZEGTES.js → chunk-NCK5QLGT.js} +1 -1
  14. package/dist/chunk-NCK5QLGT.js.map +1 -0
  15. package/dist/contract/index.d.ts +13 -12
  16. package/dist/contract/index.js +25 -0
  17. package/dist/contract/index.js.map +1 -1
  18. package/dist/{control-ojEWkMfJ.d.ts → control-DjEgwWNo.d.ts} +6 -5
  19. package/dist/{control-runtime-BZ_lVLYW.d.ts → control-runtime-DuFBYg7A.d.ts} +3 -2
  20. package/dist/control.d.ts +7 -6
  21. package/dist/control.js +2 -2
  22. package/dist/{emitter-DP_cSSiw.d.ts → emitter-DEZwY14K.d.ts} +2 -1
  23. package/dist/{failure-cluster-Cw65_5FY.d.ts → failure-cluster-CL7IVgkJ.d.ts} +2 -1
  24. package/dist/{feedback-trajectory-BSxqEpu7.d.ts → feedback-trajectory-DpUmE90J.d.ts} +1 -1
  25. package/dist/governance/index.d.ts +3 -2
  26. package/dist/hosted/index.d.ts +7 -6
  27. package/dist/{index-C7RhhEME.d.ts → index-D2nT6_KT.d.ts} +20 -2
  28. package/dist/{index-0pu_fBwZ.d.ts → index-wlaiph9Y.d.ts} +1 -1
  29. package/dist/index.d.ts +31 -29
  30. package/dist/index.js +3 -3
  31. package/dist/{integrity-CTDhR1Sg.d.ts → integrity-CfXjSqEv.d.ts} +1 -1
  32. package/dist/knowledge/index.d.ts +4 -3
  33. package/dist/meta-eval/index.d.ts +4 -3
  34. package/dist/openapi.json +1 -1
  35. package/dist/pipelines/index.d.ts +7 -6
  36. package/dist/prm/index.d.ts +5 -4
  37. package/dist/{query-DODUYdPg.d.ts → query-CqTxMwDw.d.ts} +2 -1
  38. package/dist/{red-team-30II1T4o.d.ts → red-team-CrC5MZYd.d.ts} +1 -1
  39. package/dist/{registry-8KAs18kY.d.ts → registry-BSWy0rvH.d.ts} +1 -1
  40. package/dist/{release-report-DSu0DWy8.d.ts → release-report-B6l5fi7T.d.ts} +2 -2
  41. package/dist/reporting.d.ts +7 -6
  42. package/dist/{researcher-LZD0qHEa.d.ts → researcher-JP8EvnLv.d.ts} +11 -6
  43. package/dist/rl.d.ts +11 -10
  44. package/dist/rl.js +2 -2
  45. package/dist/{rubric-D5tjHNJQ.d.ts → rubric-BOfxn4ja.d.ts} +3 -2
  46. package/dist/{rubric-predictive-validity-ByZEC3BX.d.ts → rubric-predictive-validity-B3qNa4aY.d.ts} +1 -1
  47. package/dist/{run-improvement-loop-Cc7oZlRP.d.ts → run-improvement-loop-BhfdjrMY.d.ts} +3 -3
  48. package/dist/{run-record-BGY6bHRh.d.ts → run-record-etiCMsUq.d.ts} +11 -3
  49. package/dist/{store-Db2Bv8Cf.d.ts → schema-m0gsnbt3.d.ts} +1 -99
  50. package/dist/store-CKUAgsJz.d.ts +101 -0
  51. package/dist/{summary-report-B7gNRX-r.d.ts → summary-report-DLxh4yWk.d.ts} +2 -2
  52. package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BdVaPyHT.d.ts} +3 -2
  53. package/dist/traces.d.ts +7 -6
  54. package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-GEdXJCL5.d.ts} +2 -1
  55. package/dist/{types-Dbj5gu8n.d.ts → types-BgrxOJSf.d.ts} +31 -1
  56. package/dist/wire/index.d.ts +5 -4
  57. package/docs/pilot/README.md +62 -0
  58. package/docs/pilot/customer-checklist.md +90 -0
  59. package/docs/pilot/integration-foreign-stack.md +296 -0
  60. package/docs/pilot/integration-tangle-stack.md +248 -0
  61. package/docs/pilot/one-pager.md +161 -0
  62. package/docs/pilot/sample-insight-report.json +172 -0
  63. package/docs/research/research-roadmap.md +204 -0
  64. package/package.json +1 -1
  65. package/dist/chunk-5KSDYBYH.js.map +0 -1
  66. package/dist/chunk-BWZEGTES.js.map +0 -1
  67. package/dist/{chunk-L7XMNXLO.js.map → chunk-J4DIMSRK.js.map} +0 -0
@@ -1,20 +1,21 @@
1
- import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-Dbj5gu8n.js';
2
- export { f as CampaignAggregates, g as CampaignArtifactWriter, h as CampaignCellResult, i as CampaignCostMeter, j as CampaignResult, k as CampaignTraceWriter, C as CodeSurface, D as Dispatch, l as GateContext, m as GateDecision, n as GateResult, o as GenerationCandidate, p as GenerationRecord, r as JudgeDimension, J as JudgeScore, t as Mutator, O as OptimizerConfig, v as SessionScript } from '../types-Dbj5gu8n.js';
3
- import { C as CampaignStorage, e as RunImprovementLoopResult } from '../run-improvement-loop-Cc7oZlRP.js';
4
- export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, a as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, c as RunEvalOptions, d as RunImprovementLoopOptions, h as composeGate, j as defaultProductionGate, k as evolutionaryDriver, m as fsCampaignStorage, n as gepaDriver, o as heldOutGate, p as inMemoryCampaignStorage, r as runCampaign, s as runEval, t as runImprovementLoop } from '../run-improvement-loop-Cc7oZlRP.js';
1
+ import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-BgrxOJSf.js';
2
+ export { g as CampaignAggregates, h as CampaignArtifactWriter, i as CampaignCellResult, j as CampaignCostMeter, k as CampaignResult, l as CampaignTraceWriter, C as CodeSurface, D as Dispatch, m as GateContext, n as GateDecision, o as GateResult, p as GenerationCandidate, q as GenerationRecord, s as JudgeDimension, J as JudgeScore, u as Mutator, O as OptimizerConfig, w as SessionScript } from '../types-BgrxOJSf.js';
3
+ import { C as CampaignStorage, e as RunImprovementLoopResult } from '../run-improvement-loop-BhfdjrMY.js';
4
+ export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, a as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, c as RunEvalOptions, d as RunImprovementLoopOptions, h as composeGate, j as defaultProductionGate, k as evolutionaryDriver, m as fsCampaignStorage, n as gepaDriver, o as heldOutGate, p as inMemoryCampaignStorage, r as runCampaign, s as runEval, t as runImprovementLoop } from '../run-improvement-loop-BhfdjrMY.js';
5
5
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
6
- import { a as HostedTenant, I as InsightReport, T as TraceSpanEvent } from '../index-C7RhhEME.js';
7
- export { F as FailureClusterInsight, b as InterRaterInsight, J as JudgeInsight, L as LiftInsight, O as OutcomeCorrelationInsight, R as Recommendation, c as ReleaseSummary, S as ScalarDistribution } from '../index-C7RhhEME.js';
8
- import { A as AnalystRegistry } from '../registry-8KAs18kY.js';
6
+ import { a as HostedTenant, I as InsightReport, T as TraceSpanEvent } from '../index-D2nT6_KT.js';
7
+ export { F as FailureClusterInsight, b as InterRaterInsight, J as JudgeInsight, L as LiftInsight, O as OutcomeCorrelationInsight, R as Recommendation, c as ReleaseSummary, S as ScalarDistribution } from '../index-D2nT6_KT.js';
8
+ import { A as AnalystRegistry } from '../registry-BSWy0rvH.js';
9
9
  import { a as DatasetScenario } from '../dataset-BlwAtYYf.js';
10
- import { R as RunRecord, a as RunSplitTag } from '../run-record-BGY6bHRh.js';
10
+ import { R as RunRecord, a as RunSplitTag } from '../run-record-etiCMsUq.js';
11
11
  import '../llm-client-BXVRUZyX.js';
12
12
  import '../errors-mje_cKOs.js';
13
13
  import '../raw-provider-sink-C46HDghv.js';
14
- import '../red-team-30II1T4o.js';
15
- import '../store-Db2Bv8Cf.js';
16
- import '../summary-report-B7gNRX-r.js';
17
- import '../failure-cluster-Cw65_5FY.js';
14
+ import '../red-team-CrC5MZYd.js';
15
+ import '../store-CKUAgsJz.js';
16
+ import '../schema-m0gsnbt3.js';
17
+ import '../summary-report-DLxh4yWk.js';
18
+ import '../failure-cluster-CL7IVgkJ.js';
18
19
  import '../judge-calibration-DilmB3Ml.js';
19
20
  import '../store-CJbzDxZ2.js';
20
21
  import '../types-DhqpAi_z.js';
@@ -75,6 +75,7 @@ async function analyzeRuns(opts) {
75
75
  const interRater = opts.raterScores ? computeInterRater(opts.raterScores) : void 0;
76
76
  const lift = computeLift(runs, opts.baselineCandidateId, opts.candidateCandidateId, split);
77
77
  const failureClusters = opts.analyst ? await computeFailureClusters(runs, opts.analyst, split) : void 0;
78
+ const failureModes = computeFailureModes(runs);
78
79
  const contamination = opts.canaryScenarios ? computeContamination(runs, opts.canaryScenarios) : void 0;
79
80
  const outcomeCorrelation = opts.outcomeSignal ? computeOutcomeCorrelation(runs, opts.outcomeSignal, split) : void 0;
80
81
  const release = buildReleaseScorecard(composite, lift, contamination);
@@ -85,6 +86,7 @@ async function analyzeRuns(opts) {
85
86
  interRater,
86
87
  lift,
87
88
  failureClusters,
89
+ failureModes,
88
90
  contamination,
89
91
  outcomeCorrelation,
90
92
  priorPeriodComparison,
@@ -102,10 +104,21 @@ async function analyzeRuns(opts) {
102
104
  contamination,
103
105
  outcomeCorrelation,
104
106
  release,
107
+ ...failureModes ? { failureModes } : {},
105
108
  ...priorPeriodComparison ? { priorPeriodComparison } : {},
106
109
  recommendations
107
110
  };
108
111
  }
112
+ function computeFailureModes(runs) {
113
+ const counts = /* @__PURE__ */ new Map();
114
+ for (const r of runs) {
115
+ const key = r.failureClass ?? r.failureMode;
116
+ if (key) counts.set(key, (counts.get(key) ?? 0) + 1);
117
+ }
118
+ if (counts.size === 0) return void 0;
119
+ const n = runs.length;
120
+ return [...counts.entries()].map(([mode, count]) => ({ mode, count, share: n > 0 ? count / n : 0 })).sort((a, b) => b.count - a.count || a.mode.localeCompare(b.mode));
121
+ }
109
122
  function computePriorPeriodComparison(current, baseline, split, windowLabel) {
110
123
  if (current.length === 0 || baseline.length === 0) return void 0;
111
124
  const metrics = {};
@@ -661,6 +674,18 @@ function buildRecommendations(ctx) {
661
674
  });
662
675
  }
663
676
  }
677
+ if (ctx.failureModes && ctx.failureModes.length > 0) {
678
+ const top = ctx.failureModes[0];
679
+ if (top.count >= 3 && top.share >= 0.15) {
680
+ out.push({
681
+ priority: top.share >= 0.25 ? "high" : "medium",
682
+ kind: "investigate",
683
+ title: `'${top.mode}' is the dominant failure mode \u2014 ${top.count} runs (${(top.share * 100).toFixed(0)}% of the corpus)`,
684
+ detail: `The mean composite can look acceptable while one named failure dominates the lower tail. ${top.count} of ${ctx.composite.n} runs failed with '${top.mode}'${ctx.failureModes.length > 1 ? ` (next: '${ctx.failureModes[1].mode}' \xD7${ctx.failureModes[1].count})` : ""}. Fix this cause first.`,
685
+ evidencePath: "failureModes"
686
+ });
687
+ }
688
+ }
664
689
  if (Object.keys(ctx.judges).length === 0 && ctx.composite.n > 0) {
665
690
  out.push({
666
691
  priority: "medium",