@tangle-network/agent-eval 0.32.0 → 0.33.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/CHANGELOG.md +30 -0
  2. package/dist/benchmarks/index.d.ts +2 -2
  3. package/dist/chunk-DCZXFOQN.js +489 -0
  4. package/dist/chunk-DCZXFOQN.js.map +1 -0
  5. package/dist/{chunk-B73G44OH.js → chunk-FT3IAMQR.js} +5 -5
  6. package/dist/chunk-FT3IAMQR.js.map +1 -0
  7. package/dist/{chunk-GVQT44CS.js → chunk-KE7TDJUO.js} +2 -2
  8. package/dist/{chunk-4L3WJXQJ.js → chunk-KHZRNY3F.js} +163 -2
  9. package/dist/{chunk-4L3WJXQJ.js.map → chunk-KHZRNY3F.js.map} +1 -1
  10. package/dist/{chunk-WGXZAQLR.js → chunk-LGAPK7NA.js} +2 -2
  11. package/dist/{chunk-DTEJNZYK.js → chunk-SQYRO3BT.js} +47 -4
  12. package/dist/chunk-SQYRO3BT.js.map +1 -0
  13. package/dist/{chunk-CXJOVDJR.js → chunk-TQL7BAOY.js} +5 -175
  14. package/dist/chunk-TQL7BAOY.js.map +1 -0
  15. package/dist/{chunk-M6RZ5LJN.js → chunk-VXNVVBZO.js} +34 -5
  16. package/dist/chunk-VXNVVBZO.js.map +1 -0
  17. package/dist/{chunk-S4Y5VXMS.js → chunk-WRGHMGWT.js} +2 -2
  18. package/dist/{chunk-SMSGXM74.js → chunk-YU3G6I7F.js} +2 -2
  19. package/dist/cli.js +2 -2
  20. package/dist/{control-p2ns7elI.d.ts → control-C3k02SCP.d.ts} +1 -1
  21. package/dist/control.d.ts +2 -2
  22. package/dist/control.js +3 -2
  23. package/dist/governance/index.d.ts +2 -1
  24. package/dist/{index-DPILdKbP.d.ts → index-CN2agEaO.d.ts} +2 -142
  25. package/dist/{index-BTqhGHJT.d.ts → index-ClMxVqe_.d.ts} +1 -1
  26. package/dist/index.d.ts +39 -486
  27. package/dist/index.js +75 -68
  28. package/dist/index.js.map +1 -1
  29. package/dist/judge-calibration-DilmB3Ml.d.ts +142 -0
  30. package/dist/meta-eval/index.d.ts +2 -2
  31. package/dist/openapi.json +1 -1
  32. package/dist/optimization.d.ts +3 -3
  33. package/dist/optimization.js +6 -6
  34. package/dist/pipelines/index.js +2 -2
  35. package/dist/release-report-ChfmCmLi.d.ts +713 -0
  36. package/dist/reporting.d.ts +6 -4
  37. package/dist/reporting.js +10 -9
  38. package/dist/{researcher-BRHa5Jxo.d.ts → researcher-CfnL3HEb.d.ts} +34 -3
  39. package/dist/rl.d.ts +5 -5
  40. package/dist/rl.js +6 -6
  41. package/dist/rl.js.map +1 -1
  42. package/dist/{rubric-predictive-validity-CMHypZ_M.d.ts → rubric-predictive-validity-BvaNwfBE.d.ts} +1 -1
  43. package/dist/{run-record-BfX5y68A.d.ts → run-record-YinVdFwu.d.ts} +78 -2
  44. package/dist/{summary-report-D7AQS7eB.d.ts → summary-report-BPJVzIeW.d.ts} +2 -2
  45. package/dist/wire/index.js +2 -2
  46. package/docs/product-eval-adoption.md +18 -0
  47. package/package.json +22 -12
  48. package/dist/chunk-B73G44OH.js.map +0 -1
  49. package/dist/chunk-CXJOVDJR.js.map +0 -1
  50. package/dist/chunk-DTEJNZYK.js.map +0 -1
  51. package/dist/chunk-M6RZ5LJN.js.map +0 -1
  52. package/dist/chunk-ZN2CMQIW.js +0 -208
  53. package/dist/chunk-ZN2CMQIW.js.map +0 -1
  54. package/dist/release-report-DLWbBPtH.d.ts +0 -292
  55. /package/dist/{chunk-GVQT44CS.js.map → chunk-KE7TDJUO.js.map} +0 -0
  56. /package/dist/{chunk-WGXZAQLR.js.map → chunk-LGAPK7NA.js.map} +0 -0
  57. /package/dist/{chunk-S4Y5VXMS.js.map → chunk-WRGHMGWT.js.map} +0 -0
  58. /package/dist/{chunk-SMSGXM74.js.map → chunk-YU3G6I7F.js.map} +0 -0
@@ -1,148 +1,8 @@
1
1
  import { a as DatasetScenario, c as Dataset, b as DatasetManifest } from './dataset-ueRVTUoY.js';
2
+ import { b as CalibrationResult } from './judge-calibration-DilmB3Ml.js';
2
3
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
3
4
  import { T as TraceStore } from './store-Db2Bv8Cf.js';
4
5
 
5
- /**
6
- * Judge calibration — measure judge quality against human gold + bias.
7
- *
8
- * Workflow:
9
- * 1. Build a golden set: {itemId, humanScore}[].
10
- * 2. Run candidate judges; each produces {itemId, score}.
11
- * 3. `calibrateJudge(golden, candidate)` reports κ + Pearson + MAE.
12
- * 4. `calibrateJudgeContinuous(golden, candidate)` adds quadratic-weighted
13
- * κ over the un-rounded [0,1] scores plus ICC(2,1), Pearson, Spearman,
14
- * and bootstrap CIs — use this for fine-grained judges where rounding
15
- * to int discards information (e.g. 0.78 vs 0.81 both round to 1 and
16
- * look "perfectly agreed" to integer κ).
17
- * 5. Run bias probes (positional, verbosity, self-preference) to
18
- * detect systematic score inflation.
19
- * 6. For N≥2 judges on the same items, `continuousAgreement(scores)`
20
- * reports ICC(2,1) + κ_w + Pearson + Spearman with bootstrap CIs.
21
- *
22
- * Returns actionable diagnostics, not a single number. Consumers then
23
- * decide whether to trust the judge, retrain it, or add a tie-breaker.
24
- */
25
- interface GoldenItem {
26
- itemId: string;
27
- humanScore: number;
28
- /** Optional group used for per-group bias audits (e.g. model-of-output family). */
29
- group?: string;
30
- }
31
- interface CandidateScore {
32
- itemId: string;
33
- score: number;
34
- /** Optional — enables positional-bias analysis (did order matter?). */
35
- positionOfAInput?: 'first' | 'second';
36
- }
37
- interface CalibrationResult {
38
- n: number;
39
- pearson: number;
40
- /** Cohen's κ with quadratic weights over integer-rounded scores. */
41
- kappa: number;
42
- /** Mean absolute error vs human. */
43
- mae: number;
44
- /** Worst-5 miscalibrations (largest |judge - human|). */
45
- worstItems: Array<{
46
- itemId: string;
47
- judge: number;
48
- human: number;
49
- delta: number;
50
- }>;
51
- }
52
- declare function calibrateJudge(golden: GoldenItem[], candidate: CandidateScore[]): CalibrationResult;
53
- interface PositionalBiasResult {
54
- /**
55
- * Score delta (first-position - second-position) averaged across items
56
- * presented in both positions. Non-zero = positional bias.
57
- */
58
- avgDelta: number;
59
- n: number;
60
- }
61
- /**
62
- * Feed the same items to the judge twice with A/B swapped and pass all
63
- * results here. Items that don't appear in both positions are ignored.
64
- */
65
- declare function positionalBias(scores: CandidateScore[]): PositionalBiasResult;
66
- interface VerbosityBiasResult {
67
- /** Pearson correlation between output length and score. Strong positive = verbosity bias. */
68
- pearson: number;
69
- n: number;
70
- }
71
- declare function verbosityBias(samples: Array<{
72
- outputLen: number;
73
- score: number;
74
- }>): VerbosityBiasResult;
75
- interface SelfPreferenceResult {
76
- /** Mean judge score when judge's family matches output's family. */
77
- inFamilyMean: number;
78
- outOfFamilyMean: number;
79
- deltaMean: number;
80
- n: number;
81
- }
82
- /**
83
- * Pass the same scenarios scored with judge-model X grading outputs from
84
- * model X (in-family) and model Y (out-of-family). Non-zero delta
85
- * indicates self-preference.
86
- */
87
- declare function selfPreference(samples: Array<{
88
- score: number;
89
- inFamily: boolean;
90
- }>): SelfPreferenceResult;
91
- interface ContinuousAgreement {
92
- /** Cohen's κ_w with quadratic weights, computed on raw [0,1] scores. */
93
- weightedKappa: number;
94
- /** ICC(2,1): two-way random effects, absolute agreement, single rater. */
95
- icc: number;
96
- /** Pearson product-moment correlation (averaged over rater pairs if N>2). */
97
- pearson: number;
98
- /** Spearman rank correlation (averaged over rater pairs if N>2). */
99
- spearman: number;
100
- /** 95% bootstrap percentile CIs over items. */
101
- ci: {
102
- icc: [number, number];
103
- weightedKappa: [number, number];
104
- };
105
- /** Number of complete items (no NaN across raters). */
106
- n: number;
107
- /** Number of raters. */
108
- raters: number;
109
- }
110
- interface ContinuousAgreementOptions {
111
- /** Bootstrap iterations. Default 1000. Set to 0 to skip CIs (CI = [NaN, NaN]). */
112
- bootstrap?: number;
113
- /** κ weighting scheme. Default 'quadratic'. */
114
- weights?: 'linear' | 'quadratic';
115
- /** PRNG seed for reproducible bootstrap. Default 0xC0FFEE. */
116
- seed?: number;
117
- /** Confidence level for percentile CI. Default 0.95. */
118
- ciLevel?: number;
119
- }
120
- /**
121
- * Inter-rater agreement on continuous (typically [0,1]) scores.
122
- *
123
- * `scores` has shape [n_items][n_raters]. Rows with any non-finite entry
124
- * are dropped. Returns NaN metrics if fewer than 2 raters or 2 complete
125
- * items remain.
126
- */
127
- declare function continuousAgreement(scores: number[][], opts?: ContinuousAgreementOptions): ContinuousAgreement;
128
- interface ContinuousCalibrationResult extends CalibrationResult {
129
- /** Cohen's κ_w computed on raw (un-rounded) scores. */
130
- weightedKappaContinuous: number;
131
- /** ICC(2,1) treating golden + candidate as two raters. */
132
- icc: number;
133
- spearman: number;
134
- ci: {
135
- icc: [number, number];
136
- weightedKappa: [number, number];
137
- };
138
- }
139
- /**
140
- * Drop-in superset of `calibrateJudge` that adds continuous-value
141
- * agreement metrics. The old fields (n, pearson, kappa, mae, worstItems)
142
- * are preserved unchanged so existing callers continue to work.
143
- */
144
- declare function calibrateJudgeContinuous(golden: GoldenItem[], candidate: CandidateScore[], opts?: ContinuousAgreementOptions): ContinuousCalibrationResult;
145
-
146
6
  /**
147
7
  * Red-team battery — adversarial scenario corpus with per-category
148
8
  * scorers.
@@ -328,4 +188,4 @@ declare function nistAiRmfReport(ctx: GovernanceContext): Promise<GovernanceRepo
328
188
 
329
189
  declare function soc2Report(ctx: GovernanceContext): Promise<GovernanceReport>;
330
190
 
331
- export { verbosityBias as A, type ContinuousAgreementOptions as C, DEFAULT_RED_TEAM_CORPUS as D, type EuRiskClass as E, type GoldenItem as G, type PositionalBiasResult as P, type RedTeamCase as R, type SelfPreferenceResult as S, type UseCaseSignals as U, type VerbosityBiasResult as V, type ContinuousAgreement as a, type CalibrationResult as b, type CandidateScore as c, type ContinuousCalibrationResult as d, type GovernanceContext as e, type GovernanceFinding as f, type GovernanceReport as g, type RedTeamCategory as h, type RedTeamFinding as i, type RedTeamPayload as j, type RedTeamReport as k, calibrateJudge as l, calibrateJudgeContinuous as m, classifyEuAiRisk as n, continuousAgreement as o, euAiActReport as p, nistAiRmfReport as q, positionalBias as r, redTeamDataset as s, redTeamReport as t, renderMarkdown as u, scoreRedTeamOutput as v, selfPreference as w, soc2Report as x, summarize as y, toolNamesForRun as z };
191
+ export { DEFAULT_RED_TEAM_CORPUS as D, type EuRiskClass as E, type GovernanceContext as G, type RedTeamCase as R, type UseCaseSignals as U, type GovernanceFinding as a, type GovernanceReport as b, type RedTeamCategory as c, type RedTeamFinding as d, type RedTeamPayload as e, type RedTeamReport as f, classifyEuAiRisk as g, euAiActReport as h, redTeamReport as i, renderMarkdown as j, soc2Report as k, summarize as l, nistAiRmfReport as n, redTeamDataset as r, scoreRedTeamOutput as s, toolNamesForRun as t };
@@ -1,4 +1,4 @@
1
- import { a as RunSplitTag } from './run-record-BfX5y68A.js';
1
+ import { a as RunSplitTag } from './run-record-YinVdFwu.js';
2
2
 
3
3
  /**
4
4
  * Shared types for the reference benchmark wrappers under