@tangle-network/agent-eval 0.32.0 → 0.33.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +30 -0
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/chunk-DCZXFOQN.js +489 -0
- package/dist/chunk-DCZXFOQN.js.map +1 -0
- package/dist/{chunk-B73G44OH.js → chunk-FT3IAMQR.js} +5 -5
- package/dist/chunk-FT3IAMQR.js.map +1 -0
- package/dist/{chunk-GVQT44CS.js → chunk-KE7TDJUO.js} +2 -2
- package/dist/{chunk-4L3WJXQJ.js → chunk-KHZRNY3F.js} +163 -2
- package/dist/{chunk-4L3WJXQJ.js.map → chunk-KHZRNY3F.js.map} +1 -1
- package/dist/{chunk-WGXZAQLR.js → chunk-LGAPK7NA.js} +2 -2
- package/dist/{chunk-DTEJNZYK.js → chunk-SQYRO3BT.js} +47 -4
- package/dist/chunk-SQYRO3BT.js.map +1 -0
- package/dist/{chunk-CXJOVDJR.js → chunk-TQL7BAOY.js} +5 -175
- package/dist/chunk-TQL7BAOY.js.map +1 -0
- package/dist/{chunk-M6RZ5LJN.js → chunk-VXNVVBZO.js} +34 -5
- package/dist/chunk-VXNVVBZO.js.map +1 -0
- package/dist/{chunk-S4Y5VXMS.js → chunk-WRGHMGWT.js} +2 -2
- package/dist/{chunk-SMSGXM74.js → chunk-YU3G6I7F.js} +2 -2
- package/dist/cli.js +2 -2
- package/dist/{control-p2ns7elI.d.ts → control-C3k02SCP.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/control.js +3 -2
- package/dist/governance/index.d.ts +2 -1
- package/dist/{index-DPILdKbP.d.ts → index-CN2agEaO.d.ts} +2 -142
- package/dist/{index-BTqhGHJT.d.ts → index-ClMxVqe_.d.ts} +1 -1
- package/dist/index.d.ts +39 -486
- package/dist/index.js +75 -68
- package/dist/index.js.map +1 -1
- package/dist/judge-calibration-DilmB3Ml.d.ts +142 -0
- package/dist/meta-eval/index.d.ts +2 -2
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +3 -3
- package/dist/optimization.js +6 -6
- package/dist/pipelines/index.js +2 -2
- package/dist/release-report-ChfmCmLi.d.ts +713 -0
- package/dist/reporting.d.ts +6 -4
- package/dist/reporting.js +10 -9
- package/dist/{researcher-BRHa5Jxo.d.ts → researcher-CfnL3HEb.d.ts} +34 -3
- package/dist/rl.d.ts +5 -5
- package/dist/rl.js +6 -6
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-predictive-validity-CMHypZ_M.d.ts → rubric-predictive-validity-BvaNwfBE.d.ts} +1 -1
- package/dist/{run-record-BfX5y68A.d.ts → run-record-YinVdFwu.d.ts} +78 -2
- package/dist/{summary-report-D7AQS7eB.d.ts → summary-report-BPJVzIeW.d.ts} +2 -2
- package/dist/wire/index.js +2 -2
- package/docs/product-eval-adoption.md +18 -0
- package/package.json +22 -12
- package/dist/chunk-B73G44OH.js.map +0 -1
- package/dist/chunk-CXJOVDJR.js.map +0 -1
- package/dist/chunk-DTEJNZYK.js.map +0 -1
- package/dist/chunk-M6RZ5LJN.js.map +0 -1
- package/dist/chunk-ZN2CMQIW.js +0 -208
- package/dist/chunk-ZN2CMQIW.js.map +0 -1
- package/dist/release-report-DLWbBPtH.d.ts +0 -292
- /package/dist/{chunk-GVQT44CS.js.map → chunk-KE7TDJUO.js.map} +0 -0
- /package/dist/{chunk-WGXZAQLR.js.map → chunk-LGAPK7NA.js.map} +0 -0
- /package/dist/{chunk-S4Y5VXMS.js.map → chunk-WRGHMGWT.js.map} +0 -0
- /package/dist/{chunk-SMSGXM74.js.map → chunk-YU3G6I7F.js.map} +0 -0
|
@@ -1,148 +1,8 @@
|
|
|
1
1
|
import { a as DatasetScenario, c as Dataset, b as DatasetManifest } from './dataset-ueRVTUoY.js';
|
|
2
|
+
import { b as CalibrationResult } from './judge-calibration-DilmB3Ml.js';
|
|
2
3
|
import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
3
4
|
import { T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
4
5
|
|
|
5
|
-
/**
|
|
6
|
-
* Judge calibration — measure judge quality against human gold + bias.
|
|
7
|
-
*
|
|
8
|
-
* Workflow:
|
|
9
|
-
* 1. Build a golden set: {itemId, humanScore}[].
|
|
10
|
-
* 2. Run candidate judges; each produces {itemId, score}.
|
|
11
|
-
* 3. `calibrateJudge(golden, candidate)` reports κ + Pearson + MAE.
|
|
12
|
-
* 4. `calibrateJudgeContinuous(golden, candidate)` adds quadratic-weighted
|
|
13
|
-
* κ over the un-rounded [0,1] scores plus ICC(2,1), Pearson, Spearman,
|
|
14
|
-
* and bootstrap CIs — use this for fine-grained judges where rounding
|
|
15
|
-
* to int discards information (e.g. 0.78 vs 0.81 both round to 1 and
|
|
16
|
-
* look "perfectly agreed" to integer κ).
|
|
17
|
-
* 5. Run bias probes (positional, verbosity, self-preference) to
|
|
18
|
-
* detect systematic score inflation.
|
|
19
|
-
* 6. For N≥2 judges on the same items, `continuousAgreement(scores)`
|
|
20
|
-
* reports ICC(2,1) + κ_w + Pearson + Spearman with bootstrap CIs.
|
|
21
|
-
*
|
|
22
|
-
* Returns actionable diagnostics, not a single number. Consumers then
|
|
23
|
-
* decide whether to trust the judge, retrain it, or add a tie-breaker.
|
|
24
|
-
*/
|
|
25
|
-
interface GoldenItem {
|
|
26
|
-
itemId: string;
|
|
27
|
-
humanScore: number;
|
|
28
|
-
/** Optional group used for per-group bias audits (e.g. model-of-output family). */
|
|
29
|
-
group?: string;
|
|
30
|
-
}
|
|
31
|
-
interface CandidateScore {
|
|
32
|
-
itemId: string;
|
|
33
|
-
score: number;
|
|
34
|
-
/** Optional — enables positional-bias analysis (did order matter?). */
|
|
35
|
-
positionOfAInput?: 'first' | 'second';
|
|
36
|
-
}
|
|
37
|
-
interface CalibrationResult {
|
|
38
|
-
n: number;
|
|
39
|
-
pearson: number;
|
|
40
|
-
/** Cohen's κ with quadratic weights over integer-rounded scores. */
|
|
41
|
-
kappa: number;
|
|
42
|
-
/** Mean absolute error vs human. */
|
|
43
|
-
mae: number;
|
|
44
|
-
/** Worst-5 miscalibrations (largest |judge - human|). */
|
|
45
|
-
worstItems: Array<{
|
|
46
|
-
itemId: string;
|
|
47
|
-
judge: number;
|
|
48
|
-
human: number;
|
|
49
|
-
delta: number;
|
|
50
|
-
}>;
|
|
51
|
-
}
|
|
52
|
-
declare function calibrateJudge(golden: GoldenItem[], candidate: CandidateScore[]): CalibrationResult;
|
|
53
|
-
interface PositionalBiasResult {
|
|
54
|
-
/**
|
|
55
|
-
* Score delta (first-position - second-position) averaged across items
|
|
56
|
-
* presented in both positions. Non-zero = positional bias.
|
|
57
|
-
*/
|
|
58
|
-
avgDelta: number;
|
|
59
|
-
n: number;
|
|
60
|
-
}
|
|
61
|
-
/**
|
|
62
|
-
* Feed the same items to the judge twice with A/B swapped and pass all
|
|
63
|
-
* results here. Items that don't appear in both positions are ignored.
|
|
64
|
-
*/
|
|
65
|
-
declare function positionalBias(scores: CandidateScore[]): PositionalBiasResult;
|
|
66
|
-
interface VerbosityBiasResult {
|
|
67
|
-
/** Pearson correlation between output length and score. Strong positive = verbosity bias. */
|
|
68
|
-
pearson: number;
|
|
69
|
-
n: number;
|
|
70
|
-
}
|
|
71
|
-
declare function verbosityBias(samples: Array<{
|
|
72
|
-
outputLen: number;
|
|
73
|
-
score: number;
|
|
74
|
-
}>): VerbosityBiasResult;
|
|
75
|
-
interface SelfPreferenceResult {
|
|
76
|
-
/** Mean judge score when judge's family matches output's family. */
|
|
77
|
-
inFamilyMean: number;
|
|
78
|
-
outOfFamilyMean: number;
|
|
79
|
-
deltaMean: number;
|
|
80
|
-
n: number;
|
|
81
|
-
}
|
|
82
|
-
/**
|
|
83
|
-
* Pass the same scenarios scored with judge-model X grading outputs from
|
|
84
|
-
* model X (in-family) and model Y (out-of-family). Non-zero delta
|
|
85
|
-
* indicates self-preference.
|
|
86
|
-
*/
|
|
87
|
-
declare function selfPreference(samples: Array<{
|
|
88
|
-
score: number;
|
|
89
|
-
inFamily: boolean;
|
|
90
|
-
}>): SelfPreferenceResult;
|
|
91
|
-
interface ContinuousAgreement {
|
|
92
|
-
/** Cohen's κ_w with quadratic weights, computed on raw [0,1] scores. */
|
|
93
|
-
weightedKappa: number;
|
|
94
|
-
/** ICC(2,1): two-way random effects, absolute agreement, single rater. */
|
|
95
|
-
icc: number;
|
|
96
|
-
/** Pearson product-moment correlation (averaged over rater pairs if N>2). */
|
|
97
|
-
pearson: number;
|
|
98
|
-
/** Spearman rank correlation (averaged over rater pairs if N>2). */
|
|
99
|
-
spearman: number;
|
|
100
|
-
/** 95% bootstrap percentile CIs over items. */
|
|
101
|
-
ci: {
|
|
102
|
-
icc: [number, number];
|
|
103
|
-
weightedKappa: [number, number];
|
|
104
|
-
};
|
|
105
|
-
/** Number of complete items (no NaN across raters). */
|
|
106
|
-
n: number;
|
|
107
|
-
/** Number of raters. */
|
|
108
|
-
raters: number;
|
|
109
|
-
}
|
|
110
|
-
interface ContinuousAgreementOptions {
|
|
111
|
-
/** Bootstrap iterations. Default 1000. Set to 0 to skip CIs (CI = [NaN, NaN]). */
|
|
112
|
-
bootstrap?: number;
|
|
113
|
-
/** κ weighting scheme. Default 'quadratic'. */
|
|
114
|
-
weights?: 'linear' | 'quadratic';
|
|
115
|
-
/** PRNG seed for reproducible bootstrap. Default 0xC0FFEE. */
|
|
116
|
-
seed?: number;
|
|
117
|
-
/** Confidence level for percentile CI. Default 0.95. */
|
|
118
|
-
ciLevel?: number;
|
|
119
|
-
}
|
|
120
|
-
/**
|
|
121
|
-
* Inter-rater agreement on continuous (typically [0,1]) scores.
|
|
122
|
-
*
|
|
123
|
-
* `scores` has shape [n_items][n_raters]. Rows with any non-finite entry
|
|
124
|
-
* are dropped. Returns NaN metrics if fewer than 2 raters or 2 complete
|
|
125
|
-
* items remain.
|
|
126
|
-
*/
|
|
127
|
-
declare function continuousAgreement(scores: number[][], opts?: ContinuousAgreementOptions): ContinuousAgreement;
|
|
128
|
-
interface ContinuousCalibrationResult extends CalibrationResult {
|
|
129
|
-
/** Cohen's κ_w computed on raw (un-rounded) scores. */
|
|
130
|
-
weightedKappaContinuous: number;
|
|
131
|
-
/** ICC(2,1) treating golden + candidate as two raters. */
|
|
132
|
-
icc: number;
|
|
133
|
-
spearman: number;
|
|
134
|
-
ci: {
|
|
135
|
-
icc: [number, number];
|
|
136
|
-
weightedKappa: [number, number];
|
|
137
|
-
};
|
|
138
|
-
}
|
|
139
|
-
/**
|
|
140
|
-
* Drop-in superset of `calibrateJudge` that adds continuous-value
|
|
141
|
-
* agreement metrics. The old fields (n, pearson, kappa, mae, worstItems)
|
|
142
|
-
* are preserved unchanged so existing callers continue to work.
|
|
143
|
-
*/
|
|
144
|
-
declare function calibrateJudgeContinuous(golden: GoldenItem[], candidate: CandidateScore[], opts?: ContinuousAgreementOptions): ContinuousCalibrationResult;
|
|
145
|
-
|
|
146
6
|
/**
|
|
147
7
|
* Red-team battery — adversarial scenario corpus with per-category
|
|
148
8
|
* scorers.
|
|
@@ -328,4 +188,4 @@ declare function nistAiRmfReport(ctx: GovernanceContext): Promise<GovernanceRepo
|
|
|
328
188
|
|
|
329
189
|
declare function soc2Report(ctx: GovernanceContext): Promise<GovernanceReport>;
|
|
330
190
|
|
|
331
|
-
export {
|
|
191
|
+
export { DEFAULT_RED_TEAM_CORPUS as D, type EuRiskClass as E, type GovernanceContext as G, type RedTeamCase as R, type UseCaseSignals as U, type GovernanceFinding as a, type GovernanceReport as b, type RedTeamCategory as c, type RedTeamFinding as d, type RedTeamPayload as e, type RedTeamReport as f, classifyEuAiRisk as g, euAiActReport as h, redTeamReport as i, renderMarkdown as j, soc2Report as k, summarize as l, nistAiRmfReport as n, redTeamDataset as r, scoreRedTeamOutput as s, toolNamesForRun as t };
|