@tangle-network/agent-eval 0.32.0 → 0.33.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +30 -0
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/chunk-DCZXFOQN.js +489 -0
- package/dist/chunk-DCZXFOQN.js.map +1 -0
- package/dist/{chunk-B73G44OH.js → chunk-FT3IAMQR.js} +5 -5
- package/dist/chunk-FT3IAMQR.js.map +1 -0
- package/dist/{chunk-GVQT44CS.js → chunk-KE7TDJUO.js} +2 -2
- package/dist/{chunk-4L3WJXQJ.js → chunk-KHZRNY3F.js} +163 -2
- package/dist/{chunk-4L3WJXQJ.js.map → chunk-KHZRNY3F.js.map} +1 -1
- package/dist/{chunk-WGXZAQLR.js → chunk-LGAPK7NA.js} +2 -2
- package/dist/{chunk-DTEJNZYK.js → chunk-SQYRO3BT.js} +47 -4
- package/dist/chunk-SQYRO3BT.js.map +1 -0
- package/dist/{chunk-CXJOVDJR.js → chunk-TQL7BAOY.js} +5 -175
- package/dist/chunk-TQL7BAOY.js.map +1 -0
- package/dist/{chunk-M6RZ5LJN.js → chunk-VXNVVBZO.js} +34 -5
- package/dist/chunk-VXNVVBZO.js.map +1 -0
- package/dist/{chunk-S4Y5VXMS.js → chunk-WRGHMGWT.js} +2 -2
- package/dist/{chunk-SMSGXM74.js → chunk-YU3G6I7F.js} +2 -2
- package/dist/cli.js +2 -2
- package/dist/{control-p2ns7elI.d.ts → control-C3k02SCP.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/control.js +3 -2
- package/dist/governance/index.d.ts +2 -1
- package/dist/{index-DPILdKbP.d.ts → index-CN2agEaO.d.ts} +2 -142
- package/dist/{index-BTqhGHJT.d.ts → index-ClMxVqe_.d.ts} +1 -1
- package/dist/index.d.ts +39 -486
- package/dist/index.js +75 -68
- package/dist/index.js.map +1 -1
- package/dist/judge-calibration-DilmB3Ml.d.ts +142 -0
- package/dist/meta-eval/index.d.ts +2 -2
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +3 -3
- package/dist/optimization.js +6 -6
- package/dist/pipelines/index.js +2 -2
- package/dist/release-report-ChfmCmLi.d.ts +713 -0
- package/dist/reporting.d.ts +6 -4
- package/dist/reporting.js +10 -9
- package/dist/{researcher-BRHa5Jxo.d.ts → researcher-CfnL3HEb.d.ts} +34 -3
- package/dist/rl.d.ts +5 -5
- package/dist/rl.js +6 -6
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-predictive-validity-CMHypZ_M.d.ts → rubric-predictive-validity-BvaNwfBE.d.ts} +1 -1
- package/dist/{run-record-BfX5y68A.d.ts → run-record-YinVdFwu.d.ts} +78 -2
- package/dist/{summary-report-D7AQS7eB.d.ts → summary-report-BPJVzIeW.d.ts} +2 -2
- package/dist/wire/index.js +2 -2
- package/docs/product-eval-adoption.md +18 -0
- package/package.json +22 -12
- package/dist/chunk-B73G44OH.js.map +0 -1
- package/dist/chunk-CXJOVDJR.js.map +0 -1
- package/dist/chunk-DTEJNZYK.js.map +0 -1
- package/dist/chunk-M6RZ5LJN.js.map +0 -1
- package/dist/chunk-ZN2CMQIW.js +0 -208
- package/dist/chunk-ZN2CMQIW.js.map +0 -1
- package/dist/release-report-DLWbBPtH.d.ts +0 -292
- /package/dist/{chunk-GVQT44CS.js.map → chunk-KE7TDJUO.js.map} +0 -0
- /package/dist/{chunk-WGXZAQLR.js.map → chunk-LGAPK7NA.js.map} +0 -0
- /package/dist/{chunk-S4Y5VXMS.js.map → chunk-WRGHMGWT.js.map} +0 -0
- /package/dist/{chunk-SMSGXM74.js.map → chunk-YU3G6I7F.js.map} +0 -0
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Judge calibration — measure judge quality against human gold + bias.
|
|
3
|
+
*
|
|
4
|
+
* Workflow:
|
|
5
|
+
* 1. Build a golden set: {itemId, humanScore}[].
|
|
6
|
+
* 2. Run candidate judges; each produces {itemId, score}.
|
|
7
|
+
* 3. `calibrateJudge(golden, candidate)` reports κ + Pearson + MAE.
|
|
8
|
+
* 4. `calibrateJudgeContinuous(golden, candidate)` adds quadratic-weighted
|
|
9
|
+
* κ over the un-rounded [0,1] scores plus ICC(2,1), Pearson, Spearman,
|
|
10
|
+
* and bootstrap CIs — use this for fine-grained judges where rounding
|
|
11
|
+
* to int discards information (e.g. 0.78 vs 0.81 both round to 1 and
|
|
12
|
+
* look "perfectly agreed" to integer κ).
|
|
13
|
+
* 5. Run bias probes (positional, verbosity, self-preference) to
|
|
14
|
+
* detect systematic score inflation.
|
|
15
|
+
* 6. For N≥2 judges on the same items, `continuousAgreement(scores)`
|
|
16
|
+
* reports ICC(2,1) + κ_w + Pearson + Spearman with bootstrap CIs.
|
|
17
|
+
*
|
|
18
|
+
* Returns actionable diagnostics, not a single number. Consumers then
|
|
19
|
+
* decide whether to trust the judge, retrain it, or add a tie-breaker.
|
|
20
|
+
*/
|
|
21
|
+
interface GoldenItem {
|
|
22
|
+
itemId: string;
|
|
23
|
+
humanScore: number;
|
|
24
|
+
/** Optional group used for per-group bias audits (e.g. model-of-output family). */
|
|
25
|
+
group?: string;
|
|
26
|
+
}
|
|
27
|
+
interface CandidateScore {
|
|
28
|
+
itemId: string;
|
|
29
|
+
score: number;
|
|
30
|
+
/** Optional — enables positional-bias analysis (did order matter?). */
|
|
31
|
+
positionOfAInput?: 'first' | 'second';
|
|
32
|
+
}
|
|
33
|
+
interface CalibrationResult {
|
|
34
|
+
n: number;
|
|
35
|
+
pearson: number;
|
|
36
|
+
/** Cohen's κ with quadratic weights over integer-rounded scores. */
|
|
37
|
+
kappa: number;
|
|
38
|
+
/** Mean absolute error vs human. */
|
|
39
|
+
mae: number;
|
|
40
|
+
/** Worst-5 miscalibrations (largest |judge - human|). */
|
|
41
|
+
worstItems: Array<{
|
|
42
|
+
itemId: string;
|
|
43
|
+
judge: number;
|
|
44
|
+
human: number;
|
|
45
|
+
delta: number;
|
|
46
|
+
}>;
|
|
47
|
+
}
|
|
48
|
+
declare function calibrateJudge(golden: GoldenItem[], candidate: CandidateScore[]): CalibrationResult;
|
|
49
|
+
interface PositionalBiasResult {
|
|
50
|
+
/**
|
|
51
|
+
* Score delta (first-position - second-position) averaged across items
|
|
52
|
+
* presented in both positions. Non-zero = positional bias.
|
|
53
|
+
*/
|
|
54
|
+
avgDelta: number;
|
|
55
|
+
n: number;
|
|
56
|
+
}
|
|
57
|
+
/**
|
|
58
|
+
* Feed the same items to the judge twice with A/B swapped and pass all
|
|
59
|
+
* results here. Items that don't appear in both positions are ignored.
|
|
60
|
+
*/
|
|
61
|
+
declare function positionalBias(scores: CandidateScore[]): PositionalBiasResult;
|
|
62
|
+
interface VerbosityBiasResult {
|
|
63
|
+
/** Pearson correlation between output length and score. Strong positive = verbosity bias. */
|
|
64
|
+
pearson: number;
|
|
65
|
+
n: number;
|
|
66
|
+
}
|
|
67
|
+
declare function verbosityBias(samples: Array<{
|
|
68
|
+
outputLen: number;
|
|
69
|
+
score: number;
|
|
70
|
+
}>): VerbosityBiasResult;
|
|
71
|
+
interface SelfPreferenceResult {
|
|
72
|
+
/** Mean judge score when judge's family matches output's family. */
|
|
73
|
+
inFamilyMean: number;
|
|
74
|
+
outOfFamilyMean: number;
|
|
75
|
+
deltaMean: number;
|
|
76
|
+
n: number;
|
|
77
|
+
}
|
|
78
|
+
/**
|
|
79
|
+
* Pass the same scenarios scored with judge-model X grading outputs from
|
|
80
|
+
* model X (in-family) and model Y (out-of-family). Non-zero delta
|
|
81
|
+
* indicates self-preference.
|
|
82
|
+
*/
|
|
83
|
+
declare function selfPreference(samples: Array<{
|
|
84
|
+
score: number;
|
|
85
|
+
inFamily: boolean;
|
|
86
|
+
}>): SelfPreferenceResult;
|
|
87
|
+
interface ContinuousAgreement {
|
|
88
|
+
/** Cohen's κ_w with quadratic weights, computed on raw [0,1] scores. */
|
|
89
|
+
weightedKappa: number;
|
|
90
|
+
/** ICC(2,1): two-way random effects, absolute agreement, single rater. */
|
|
91
|
+
icc: number;
|
|
92
|
+
/** Pearson product-moment correlation (averaged over rater pairs if N>2). */
|
|
93
|
+
pearson: number;
|
|
94
|
+
/** Spearman rank correlation (averaged over rater pairs if N>2). */
|
|
95
|
+
spearman: number;
|
|
96
|
+
/** 95% bootstrap percentile CIs over items. */
|
|
97
|
+
ci: {
|
|
98
|
+
icc: [number, number];
|
|
99
|
+
weightedKappa: [number, number];
|
|
100
|
+
};
|
|
101
|
+
/** Number of complete items (no NaN across raters). */
|
|
102
|
+
n: number;
|
|
103
|
+
/** Number of raters. */
|
|
104
|
+
raters: number;
|
|
105
|
+
}
|
|
106
|
+
interface ContinuousAgreementOptions {
|
|
107
|
+
/** Bootstrap iterations. Default 1000. Set to 0 to skip CIs (CI = [NaN, NaN]). */
|
|
108
|
+
bootstrap?: number;
|
|
109
|
+
/** κ weighting scheme. Default 'quadratic'. */
|
|
110
|
+
weights?: 'linear' | 'quadratic';
|
|
111
|
+
/** PRNG seed for reproducible bootstrap. Default 0xC0FFEE. */
|
|
112
|
+
seed?: number;
|
|
113
|
+
/** Confidence level for percentile CI. Default 0.95. */
|
|
114
|
+
ciLevel?: number;
|
|
115
|
+
}
|
|
116
|
+
/**
|
|
117
|
+
* Inter-rater agreement on continuous (typically [0,1]) scores.
|
|
118
|
+
*
|
|
119
|
+
* `scores` has shape [n_items][n_raters]. Rows with any non-finite entry
|
|
120
|
+
* are dropped. Returns NaN metrics if fewer than 2 raters or 2 complete
|
|
121
|
+
* items remain.
|
|
122
|
+
*/
|
|
123
|
+
declare function continuousAgreement(scores: number[][], opts?: ContinuousAgreementOptions): ContinuousAgreement;
|
|
124
|
+
interface ContinuousCalibrationResult extends CalibrationResult {
|
|
125
|
+
/** Cohen's κ_w computed on raw (un-rounded) scores. */
|
|
126
|
+
weightedKappaContinuous: number;
|
|
127
|
+
/** ICC(2,1) treating golden + candidate as two raters. */
|
|
128
|
+
icc: number;
|
|
129
|
+
spearman: number;
|
|
130
|
+
ci: {
|
|
131
|
+
icc: [number, number];
|
|
132
|
+
weightedKappa: [number, number];
|
|
133
|
+
};
|
|
134
|
+
}
|
|
135
|
+
/**
|
|
136
|
+
* Drop-in superset of `calibrateJudge` that adds continuous-value
|
|
137
|
+
* agreement metrics. The old fields (n, pearson, kappa, mae, worstItems)
|
|
138
|
+
* are preserved unchanged so existing callers continue to work.
|
|
139
|
+
*/
|
|
140
|
+
declare function calibrateJudgeContinuous(golden: GoldenItem[], candidate: CandidateScore[], opts?: ContinuousAgreementOptions): ContinuousCalibrationResult;
|
|
141
|
+
|
|
142
|
+
export { type ContinuousAgreementOptions as C, type GoldenItem as G, type PositionalBiasResult as P, type SelfPreferenceResult as S, type VerbosityBiasResult as V, type ContinuousAgreement as a, type CalibrationResult as b, type CandidateScore as c, type ContinuousCalibrationResult as d, calibrateJudge as e, calibrateJudgeContinuous as f, continuousAgreement as g, positionalBias as p, selfPreference as s, verbosityBias as v };
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { R as Run, T as TraceStore } from '../store-Db2Bv8Cf.js';
|
|
2
2
|
import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
3
3
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
4
|
-
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-CMHypZ_M.js';
|
|
5
|
-
import '../run-record-BfX5y68A.js';
|
|
4
|
+
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-BvaNwfBE.js';
|
|
5
|
+
import '../run-record-YinVdFwu.js';
|
|
6
6
|
import '../errors-mje_cKOs.js';
|
|
7
7
|
|
|
8
8
|
/**
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.32.0",
|
|
5
|
+
"version": "0.33.1",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
package/dist/optimization.d.ts
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-BRHa5Jxo.js';
|
|
1
|
+
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-CfnL3HEb.js';
|
|
2
2
|
export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-iATEAHmc.js';
|
|
3
|
-
export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-D7AQS7eB.js';
|
|
3
|
+
export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-BPJVzIeW.js';
|
|
4
|
+
import './run-record-YinVdFwu.js';
|
|
4
5
|
import './errors-mje_cKOs.js';
|
|
5
6
|
import './integrity-DYR5gWlb.js';
|
|
6
7
|
import './store-Db2Bv8Cf.js';
|
|
7
|
-
import './run-record-BfX5y68A.js';
|
|
8
8
|
import './emitter-DP_cSSiw.js';
|
|
9
9
|
import './control-runtime-BZ_lVLYW.js';
|
|
10
10
|
import './dataset-ueRVTUoY.js';
|
package/dist/optimization.js
CHANGED
|
@@ -25,14 +25,14 @@ import {
|
|
|
25
25
|
summarizePreferenceMemory,
|
|
26
26
|
trialTraceFromMultiShotTrial,
|
|
27
27
|
withAssignedFeedbackSplit
|
|
28
|
-
} from "./chunk-B73G44OH.js";
|
|
29
|
-
import "./chunk-ZN2CMQIW.js";
|
|
28
|
+
} from "./chunk-FT3IAMQR.js";
|
|
30
29
|
import {
|
|
31
30
|
runEvalCampaign
|
|
32
|
-
} from "./chunk-DTEJNZYK.js";
|
|
33
|
-
import "./chunk-
|
|
34
|
-
import "./chunk-
|
|
35
|
-
import "./chunk-
|
|
31
|
+
} from "./chunk-SQYRO3BT.js";
|
|
32
|
+
import "./chunk-VXNVVBZO.js";
|
|
33
|
+
import "./chunk-DCZXFOQN.js";
|
|
34
|
+
import "./chunk-TQL7BAOY.js";
|
|
35
|
+
import "./chunk-KHZRNY3F.js";
|
|
36
36
|
import "./chunk-UBPIXOC4.js";
|
|
37
37
|
import "./chunk-PC4UYEBM.js";
|
|
38
38
|
import "./chunk-TVVP3ZZQ.js";
|
package/dist/pipelines/index.js
CHANGED
|
@@ -2,13 +2,13 @@ import {
|
|
|
2
2
|
compareToBaseline,
|
|
3
3
|
computeToolUseMetrics,
|
|
4
4
|
failureClusterView
|
|
5
|
-
} from "../chunk-GVQT44CS.js";
|
|
5
|
+
} from "../chunk-KE7TDJUO.js";
|
|
6
6
|
import {
|
|
7
7
|
buildTrajectory
|
|
8
8
|
} from "../chunk-RZTMDUO7.js";
|
|
9
9
|
import {
|
|
10
10
|
interRaterReliability
|
|
11
|
-
} from "../chunk-4L3WJXQJ.js";
|
|
11
|
+
} from "../chunk-KHZRNY3F.js";
|
|
12
12
|
import {
|
|
13
13
|
aggregateLlm,
|
|
14
14
|
argHash,
|