@tangle-network/agent-eval 0.53.0 → 0.55.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +7 -6
- package/dist/{baseline-4R5deP0N.d.ts → baseline-DE36-Np7.d.ts} +1 -1
- package/dist/benchmarks/index.d.ts +3 -2
- package/dist/builder-eval/index.d.ts +4 -3
- package/dist/campaign/index.d.ts +9 -7
- package/dist/campaign/index.js +33 -4
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-L7XMNXLO.js → chunk-J4DIMSRK.js} +2 -2
- package/dist/{chunk-5KSDYBYH.js → chunk-LYL4SOKT.js} +3 -2
- package/dist/chunk-LYL4SOKT.js.map +1 -0
- package/dist/{chunk-BWZEGTES.js → chunk-NCK5QLGT.js} +1 -1
- package/dist/chunk-NCK5QLGT.js.map +1 -0
- package/dist/contract/index.d.ts +13 -12
- package/dist/contract/index.js +25 -0
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-ojEWkMfJ.d.ts → control-DjEgwWNo.d.ts} +6 -5
- package/dist/{control-runtime-BZ_lVLYW.d.ts → control-runtime-DuFBYg7A.d.ts} +3 -2
- package/dist/control.d.ts +7 -6
- package/dist/control.js +2 -2
- package/dist/{emitter-DP_cSSiw.d.ts → emitter-DEZwY14K.d.ts} +2 -1
- package/dist/{failure-cluster-Cw65_5FY.d.ts → failure-cluster-CL7IVgkJ.d.ts} +2 -1
- package/dist/{feedback-trajectory-BSxqEpu7.d.ts → feedback-trajectory-DpUmE90J.d.ts} +1 -1
- package/dist/governance/index.d.ts +3 -2
- package/dist/hosted/index.d.ts +7 -6
- package/dist/{index-C7RhhEME.d.ts → index-D2nT6_KT.d.ts} +20 -2
- package/dist/{index-0pu_fBwZ.d.ts → index-wlaiph9Y.d.ts} +1 -1
- package/dist/index.d.ts +31 -29
- package/dist/index.js +3 -3
- package/dist/{integrity-CTDhR1Sg.d.ts → integrity-CfXjSqEv.d.ts} +1 -1
- package/dist/knowledge/index.d.ts +4 -3
- package/dist/meta-eval/index.d.ts +4 -3
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.d.ts +7 -6
- package/dist/prm/index.d.ts +5 -4
- package/dist/{query-DODUYdPg.d.ts → query-CqTxMwDw.d.ts} +2 -1
- package/dist/{red-team-30II1T4o.d.ts → red-team-CrC5MZYd.d.ts} +1 -1
- package/dist/{registry-8KAs18kY.d.ts → registry-BSWy0rvH.d.ts} +1 -1
- package/dist/{release-report-DSu0DWy8.d.ts → release-report-B6l5fi7T.d.ts} +2 -2
- package/dist/reporting.d.ts +7 -6
- package/dist/{researcher-LZD0qHEa.d.ts → researcher-JP8EvnLv.d.ts} +11 -6
- package/dist/rl.d.ts +11 -10
- package/dist/rl.js +2 -2
- package/dist/{rubric-D5tjHNJQ.d.ts → rubric-BOfxn4ja.d.ts} +3 -2
- package/dist/{rubric-predictive-validity-ByZEC3BX.d.ts → rubric-predictive-validity-B3qNa4aY.d.ts} +1 -1
- package/dist/{run-improvement-loop-Cc7oZlRP.d.ts → run-improvement-loop-BhfdjrMY.d.ts} +3 -3
- package/dist/{run-record-BGY6bHRh.d.ts → run-record-etiCMsUq.d.ts} +11 -3
- package/dist/{store-Db2Bv8Cf.d.ts → schema-m0gsnbt3.d.ts} +1 -99
- package/dist/store-CKUAgsJz.d.ts +101 -0
- package/dist/{summary-report-B7gNRX-r.d.ts → summary-report-DLxh4yWk.d.ts} +2 -2
- package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BdVaPyHT.d.ts} +3 -2
- package/dist/traces.d.ts +7 -6
- package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-GEdXJCL5.d.ts} +2 -1
- package/dist/{types-Dbj5gu8n.d.ts → types-BgrxOJSf.d.ts} +31 -1
- package/dist/wire/index.d.ts +5 -4
- package/docs/pilot/README.md +62 -0
- package/docs/pilot/customer-checklist.md +90 -0
- package/docs/pilot/integration-foreign-stack.md +296 -0
- package/docs/pilot/integration-tangle-stack.md +248 -0
- package/docs/pilot/one-pager.md +161 -0
- package/docs/pilot/sample-insight-report.json +172 -0
- package/docs/research/research-roadmap.md +204 -0
- package/package.json +1 -1
- package/dist/chunk-5KSDYBYH.js.map +0 -1
- package/dist/chunk-BWZEGTES.js.map +0 -1
- /package/dist/{chunk-L7XMNXLO.js.map → chunk-J4DIMSRK.js.map} +0 -0
package/dist/contract/index.d.ts
CHANGED
|
@@ -1,20 +1,21 @@
|
|
|
1
|
-
import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-
|
|
2
|
-
export {
|
|
3
|
-
import { C as CampaignStorage, e as RunImprovementLoopResult } from '../run-improvement-loop-
|
|
4
|
-
export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, a as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, c as RunEvalOptions, d as RunImprovementLoopOptions, h as composeGate, j as defaultProductionGate, k as evolutionaryDriver, m as fsCampaignStorage, n as gepaDriver, o as heldOutGate, p as inMemoryCampaignStorage, r as runCampaign, s as runEval, t as runImprovementLoop } from '../run-improvement-loop-
|
|
1
|
+
import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-BgrxOJSf.js';
|
|
2
|
+
export { g as CampaignAggregates, h as CampaignArtifactWriter, i as CampaignCellResult, j as CampaignCostMeter, k as CampaignResult, l as CampaignTraceWriter, C as CodeSurface, D as Dispatch, m as GateContext, n as GateDecision, o as GateResult, p as GenerationCandidate, q as GenerationRecord, s as JudgeDimension, J as JudgeScore, u as Mutator, O as OptimizerConfig, w as SessionScript } from '../types-BgrxOJSf.js';
|
|
3
|
+
import { C as CampaignStorage, e as RunImprovementLoopResult } from '../run-improvement-loop-BhfdjrMY.js';
|
|
4
|
+
export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, a as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, c as RunEvalOptions, d as RunImprovementLoopOptions, h as composeGate, j as defaultProductionGate, k as evolutionaryDriver, m as fsCampaignStorage, n as gepaDriver, o as heldOutGate, p as inMemoryCampaignStorage, r as runCampaign, s as runEval, t as runImprovementLoop } from '../run-improvement-loop-BhfdjrMY.js';
|
|
5
5
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
6
|
-
import { a as HostedTenant, I as InsightReport, T as TraceSpanEvent } from '../index-
|
|
7
|
-
export { F as FailureClusterInsight, b as InterRaterInsight, J as JudgeInsight, L as LiftInsight, O as OutcomeCorrelationInsight, R as Recommendation, c as ReleaseSummary, S as ScalarDistribution } from '../index-
|
|
8
|
-
import { A as AnalystRegistry } from '../registry-
|
|
6
|
+
import { a as HostedTenant, I as InsightReport, T as TraceSpanEvent } from '../index-D2nT6_KT.js';
|
|
7
|
+
export { F as FailureClusterInsight, b as InterRaterInsight, J as JudgeInsight, L as LiftInsight, O as OutcomeCorrelationInsight, R as Recommendation, c as ReleaseSummary, S as ScalarDistribution } from '../index-D2nT6_KT.js';
|
|
8
|
+
import { A as AnalystRegistry } from '../registry-BSWy0rvH.js';
|
|
9
9
|
import { a as DatasetScenario } from '../dataset-BlwAtYYf.js';
|
|
10
|
-
import { R as RunRecord, a as RunSplitTag } from '../run-record-
|
|
10
|
+
import { R as RunRecord, a as RunSplitTag } from '../run-record-etiCMsUq.js';
|
|
11
11
|
import '../llm-client-BXVRUZyX.js';
|
|
12
12
|
import '../errors-mje_cKOs.js';
|
|
13
13
|
import '../raw-provider-sink-C46HDghv.js';
|
|
14
|
-
import '../red-team-
|
|
15
|
-
import '../store-
|
|
16
|
-
import '../
|
|
17
|
-
import '../
|
|
14
|
+
import '../red-team-CrC5MZYd.js';
|
|
15
|
+
import '../store-CKUAgsJz.js';
|
|
16
|
+
import '../schema-m0gsnbt3.js';
|
|
17
|
+
import '../summary-report-DLxh4yWk.js';
|
|
18
|
+
import '../failure-cluster-CL7IVgkJ.js';
|
|
18
19
|
import '../judge-calibration-DilmB3Ml.js';
|
|
19
20
|
import '../store-CJbzDxZ2.js';
|
|
20
21
|
import '../types-DhqpAi_z.js';
|
package/dist/contract/index.js
CHANGED
|
@@ -75,6 +75,7 @@ async function analyzeRuns(opts) {
|
|
|
75
75
|
const interRater = opts.raterScores ? computeInterRater(opts.raterScores) : void 0;
|
|
76
76
|
const lift = computeLift(runs, opts.baselineCandidateId, opts.candidateCandidateId, split);
|
|
77
77
|
const failureClusters = opts.analyst ? await computeFailureClusters(runs, opts.analyst, split) : void 0;
|
|
78
|
+
const failureModes = computeFailureModes(runs);
|
|
78
79
|
const contamination = opts.canaryScenarios ? computeContamination(runs, opts.canaryScenarios) : void 0;
|
|
79
80
|
const outcomeCorrelation = opts.outcomeSignal ? computeOutcomeCorrelation(runs, opts.outcomeSignal, split) : void 0;
|
|
80
81
|
const release = buildReleaseScorecard(composite, lift, contamination);
|
|
@@ -85,6 +86,7 @@ async function analyzeRuns(opts) {
|
|
|
85
86
|
interRater,
|
|
86
87
|
lift,
|
|
87
88
|
failureClusters,
|
|
89
|
+
failureModes,
|
|
88
90
|
contamination,
|
|
89
91
|
outcomeCorrelation,
|
|
90
92
|
priorPeriodComparison,
|
|
@@ -102,10 +104,21 @@ async function analyzeRuns(opts) {
|
|
|
102
104
|
contamination,
|
|
103
105
|
outcomeCorrelation,
|
|
104
106
|
release,
|
|
107
|
+
...failureModes ? { failureModes } : {},
|
|
105
108
|
...priorPeriodComparison ? { priorPeriodComparison } : {},
|
|
106
109
|
recommendations
|
|
107
110
|
};
|
|
108
111
|
}
|
|
112
|
+
function computeFailureModes(runs) {
|
|
113
|
+
const counts = /* @__PURE__ */ new Map();
|
|
114
|
+
for (const r of runs) {
|
|
115
|
+
const key = r.failureClass ?? r.failureMode;
|
|
116
|
+
if (key) counts.set(key, (counts.get(key) ?? 0) + 1);
|
|
117
|
+
}
|
|
118
|
+
if (counts.size === 0) return void 0;
|
|
119
|
+
const n = runs.length;
|
|
120
|
+
return [...counts.entries()].map(([mode, count]) => ({ mode, count, share: n > 0 ? count / n : 0 })).sort((a, b) => b.count - a.count || a.mode.localeCompare(b.mode));
|
|
121
|
+
}
|
|
109
122
|
function computePriorPeriodComparison(current, baseline, split, windowLabel) {
|
|
110
123
|
if (current.length === 0 || baseline.length === 0) return void 0;
|
|
111
124
|
const metrics = {};
|
|
@@ -661,6 +674,18 @@ function buildRecommendations(ctx) {
|
|
|
661
674
|
});
|
|
662
675
|
}
|
|
663
676
|
}
|
|
677
|
+
if (ctx.failureModes && ctx.failureModes.length > 0) {
|
|
678
|
+
const top = ctx.failureModes[0];
|
|
679
|
+
if (top.count >= 3 && top.share >= 0.15) {
|
|
680
|
+
out.push({
|
|
681
|
+
priority: top.share >= 0.25 ? "high" : "medium",
|
|
682
|
+
kind: "investigate",
|
|
683
|
+
title: `'${top.mode}' is the dominant failure mode \u2014 ${top.count} runs (${(top.share * 100).toFixed(0)}% of the corpus)`,
|
|
684
|
+
detail: `The mean composite can look acceptable while one named failure dominates the lower tail. ${top.count} of ${ctx.composite.n} runs failed with '${top.mode}'${ctx.failureModes.length > 1 ? ` (next: '${ctx.failureModes[1].mode}' \xD7${ctx.failureModes[1].count})` : ""}. Fix this cause first.`,
|
|
685
|
+
evidencePath: "failureModes"
|
|
686
|
+
});
|
|
687
|
+
}
|
|
688
|
+
}
|
|
664
689
|
if (Object.keys(ctx.judges).length === 0 && ctx.composite.n > 0) {
|
|
665
690
|
out.push({
|
|
666
691
|
priority: "medium",
|