@tangle-network/agent-eval 0.32.0 → 0.33.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +30 -0
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/chunk-DCZXFOQN.js +489 -0
- package/dist/chunk-DCZXFOQN.js.map +1 -0
- package/dist/{chunk-B73G44OH.js → chunk-FT3IAMQR.js} +5 -5
- package/dist/chunk-FT3IAMQR.js.map +1 -0
- package/dist/{chunk-GVQT44CS.js → chunk-KE7TDJUO.js} +2 -2
- package/dist/{chunk-4L3WJXQJ.js → chunk-KHZRNY3F.js} +163 -2
- package/dist/{chunk-4L3WJXQJ.js.map → chunk-KHZRNY3F.js.map} +1 -1
- package/dist/{chunk-WGXZAQLR.js → chunk-LGAPK7NA.js} +2 -2
- package/dist/{chunk-DTEJNZYK.js → chunk-SQYRO3BT.js} +47 -4
- package/dist/chunk-SQYRO3BT.js.map +1 -0
- package/dist/{chunk-CXJOVDJR.js → chunk-TQL7BAOY.js} +5 -175
- package/dist/chunk-TQL7BAOY.js.map +1 -0
- package/dist/{chunk-M6RZ5LJN.js → chunk-VXNVVBZO.js} +34 -5
- package/dist/chunk-VXNVVBZO.js.map +1 -0
- package/dist/{chunk-S4Y5VXMS.js → chunk-WRGHMGWT.js} +2 -2
- package/dist/{chunk-SMSGXM74.js → chunk-YU3G6I7F.js} +2 -2
- package/dist/cli.js +2 -2
- package/dist/{control-p2ns7elI.d.ts → control-C3k02SCP.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/control.js +3 -2
- package/dist/governance/index.d.ts +2 -1
- package/dist/{index-DPILdKbP.d.ts → index-CN2agEaO.d.ts} +2 -142
- package/dist/{index-BTqhGHJT.d.ts → index-ClMxVqe_.d.ts} +1 -1
- package/dist/index.d.ts +39 -486
- package/dist/index.js +75 -68
- package/dist/index.js.map +1 -1
- package/dist/judge-calibration-DilmB3Ml.d.ts +142 -0
- package/dist/meta-eval/index.d.ts +2 -2
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +3 -3
- package/dist/optimization.js +6 -6
- package/dist/pipelines/index.js +2 -2
- package/dist/release-report-ChfmCmLi.d.ts +713 -0
- package/dist/reporting.d.ts +6 -4
- package/dist/reporting.js +10 -9
- package/dist/{researcher-BRHa5Jxo.d.ts → researcher-CfnL3HEb.d.ts} +34 -3
- package/dist/rl.d.ts +5 -5
- package/dist/rl.js +6 -6
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-predictive-validity-CMHypZ_M.d.ts → rubric-predictive-validity-BvaNwfBE.d.ts} +1 -1
- package/dist/{run-record-BfX5y68A.d.ts → run-record-YinVdFwu.d.ts} +78 -2
- package/dist/{summary-report-D7AQS7eB.d.ts → summary-report-BPJVzIeW.d.ts} +2 -2
- package/dist/wire/index.js +2 -2
- package/docs/product-eval-adoption.md +18 -0
- package/package.json +22 -12
- package/dist/chunk-B73G44OH.js.map +0 -1
- package/dist/chunk-CXJOVDJR.js.map +0 -1
- package/dist/chunk-DTEJNZYK.js.map +0 -1
- package/dist/chunk-M6RZ5LJN.js.map +0 -1
- package/dist/chunk-ZN2CMQIW.js +0 -208
- package/dist/chunk-ZN2CMQIW.js.map +0 -1
- package/dist/release-report-DLWbBPtH.d.ts +0 -292
- /package/dist/{chunk-GVQT44CS.js.map → chunk-KE7TDJUO.js.map} +0 -0
- /package/dist/{chunk-WGXZAQLR.js.map → chunk-LGAPK7NA.js.map} +0 -0
- /package/dist/{chunk-S4Y5VXMS.js.map → chunk-WRGHMGWT.js.map} +0 -0
- /package/dist/{chunk-SMSGXM74.js.map → chunk-YU3G6I7F.js.map} +0 -0
package/dist/reporting.d.ts
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
|
-
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-CMHypZ_M.js';
|
|
2
|
-
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as
|
|
1
|
+
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-BvaNwfBE.js';
|
|
2
|
+
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as releaseTraceEvidenceFromMultiShotTrials, s as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-ChfmCmLi.js';
|
|
3
3
|
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
|
|
4
|
-
export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-D7AQS7eB.js';
|
|
5
|
-
import './run-record-BfX5y68A.js';
|
|
4
|
+
export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-BPJVzIeW.js';
|
|
5
|
+
import './run-record-YinVdFwu.js';
|
|
6
6
|
import './errors-mje_cKOs.js';
|
|
7
7
|
import './outcome-store-D6KWmYvj.js';
|
|
8
|
+
import './judge-calibration-DilmB3Ml.js';
|
|
9
|
+
import '@tangle-network/tcloud';
|
|
8
10
|
import './dataset-ueRVTUoY.js';
|
|
9
11
|
import './failure-cluster-Cw65_5FY.js';
|
|
10
12
|
import './store-Db2Bv8Cf.js';
|
package/dist/reporting.js
CHANGED
|
@@ -5,7 +5,7 @@ import {
|
|
|
5
5
|
judgeReplayGate,
|
|
6
6
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
7
7
|
renderReleaseReport
|
|
8
|
-
} from "./chunk-WGXZAQLR.js";
|
|
8
|
+
} from "./chunk-LGAPK7NA.js";
|
|
9
9
|
import {
|
|
10
10
|
rubricPredictiveValidity
|
|
11
11
|
} from "./chunk-YRZ4M5GS.js";
|
|
@@ -15,22 +15,23 @@ import {
|
|
|
15
15
|
} from "./chunk-MAZ26DC7.js";
|
|
16
16
|
import {
|
|
17
17
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
18
|
-
bhAdjust,
|
|
19
18
|
gainHistogram,
|
|
20
|
-
pairedBootstrap,
|
|
21
|
-
pairedWilcoxon,
|
|
22
19
|
paretoChart,
|
|
23
20
|
researchReport,
|
|
24
21
|
summaryTable
|
|
25
|
-
} from "./chunk-CXJOVDJR.js";
|
|
26
|
-
import
|
|
22
|
+
} from "./chunk-TQL7BAOY.js";
|
|
23
|
+
import {
|
|
24
|
+
benjaminiHochberg,
|
|
25
|
+
pairedBootstrap,
|
|
26
|
+
wilcoxonSignedRank
|
|
27
|
+
} from "./chunk-KHZRNY3F.js";
|
|
27
28
|
import "./chunk-VSMTAMNK.js";
|
|
28
29
|
import "./chunk-QYJT52YW.js";
|
|
29
30
|
import "./chunk-PZ5AY32C.js";
|
|
30
31
|
export {
|
|
31
32
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
32
33
|
assertReleaseConfidence,
|
|
33
|
-
|
|
34
|
+
benjaminiHochberg,
|
|
34
35
|
bootstrapCi,
|
|
35
36
|
evaluateInterimReleaseConfidence,
|
|
36
37
|
evaluateReleaseConfidence,
|
|
@@ -38,12 +39,12 @@ export {
|
|
|
38
39
|
judgeReplayGate,
|
|
39
40
|
pairedBootstrap,
|
|
40
41
|
pairedEvalueSequence,
|
|
41
|
-
pairedWilcoxon,
|
|
42
42
|
paretoChart,
|
|
43
43
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
44
44
|
renderReleaseReport,
|
|
45
45
|
researchReport,
|
|
46
46
|
rubricPredictiveValidity,
|
|
47
|
-
summaryTable
|
|
47
|
+
summaryTable,
|
|
48
|
+
wilcoxonSignedRank
|
|
48
49
|
};
|
|
49
50
|
//# sourceMappingURL=reporting.js.map
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
+
import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-YinVdFwu.js';
|
|
1
2
|
import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-mje_cKOs.js';
|
|
2
3
|
import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-DYR5gWlb.js';
|
|
3
|
-
import {
|
|
4
|
-
import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-D7AQS7eB.js';
|
|
4
|
+
import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-BPJVzIeW.js';
|
|
5
5
|
import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
|
|
6
6
|
import { T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
7
7
|
|
|
@@ -126,6 +126,21 @@ interface LlmClientOptions {
|
|
|
126
126
|
/** Override the redaction strategy for this call. Defaults to `defaultProviderRedactor`. */
|
|
127
127
|
redactor?: ProviderRedactor;
|
|
128
128
|
}
|
|
129
|
+
/**
|
|
130
|
+
* True when an error is a transient transport/network fault worth retrying,
|
|
131
|
+
* as opposed to a deterministic failure (4xx schema reject, JSON parse) that
|
|
132
|
+
* a retry cannot fix. Inspects `LlmCallError.status`, then the error's
|
|
133
|
+
* name/message/code, then recurses into `error.cause` — undici nests the
|
|
134
|
+
* real socket fault one or more levels under `.cause`.
|
|
135
|
+
*
|
|
136
|
+
* This is THE retry classifier for the package: `callLlm` and
|
|
137
|
+
* `withJudgeRetry` both route through it, so a connection-class error is
|
|
138
|
+
* treated identically whether it surfaces in the HTTP client or a
|
|
139
|
+
* TCloud-backed judge.
|
|
140
|
+
*/
|
|
141
|
+
declare function isTransientLlmError(err: unknown): boolean;
|
|
142
|
+
/** Exponential backoff: 500ms, 1s, 2s, 4s, ... capped at 16s. Attempt is 0-indexed. */
|
|
143
|
+
declare function backoffMs(attempt: number): number;
|
|
129
144
|
/**
|
|
130
145
|
* Strip a ```json / ``` code fence if the model emitted one.
|
|
131
146
|
* Idempotent for naked JSON. Some models (claude-code via router, certain
|
|
@@ -322,6 +337,12 @@ interface CampaignRunOutcome {
|
|
|
322
337
|
* Single-judge or scalar-only runs leave this unset.
|
|
323
338
|
*/
|
|
324
339
|
judgeScores?: JudgeScoresRecord;
|
|
340
|
+
/**
|
|
341
|
+
* Agent profile cell observed by the runner. When supplied, it overrides
|
|
342
|
+
* `EvalCampaignOptions.agentProfile` for this run and must match the
|
|
343
|
+
* outcome's `model` and `promptHash`.
|
|
344
|
+
*/
|
|
345
|
+
agentProfile?: AgentProfileCell | AgentProfileCellInput;
|
|
325
346
|
}
|
|
326
347
|
type CampaignRunner<V> = (ctx: CampaignRunContext<V>) => Promise<CampaignRunOutcome>;
|
|
327
348
|
type CampaignIntegrityPolicy = 'throw' | 'mark_failed' | 'log';
|
|
@@ -407,6 +428,16 @@ interface EvalCampaignOptions<V> {
|
|
|
407
428
|
now?: () => number;
|
|
408
429
|
/** Override the runId generator. Tests pin this. */
|
|
409
430
|
runId?: (params: CampaignFactoryParams) => string;
|
|
431
|
+
/**
|
|
432
|
+
* Agent profile cell for campaign runs. Static profiles can pass an object;
|
|
433
|
+
* routers or variant-specific harnesses can pass a factory. The campaign
|
|
434
|
+
* stamps the built cell onto every `RunRecord` and rejects profile/model or
|
|
435
|
+
* profile/prompt contradictions.
|
|
436
|
+
*/
|
|
437
|
+
agentProfile?: AgentProfileCell | AgentProfileCellInput | ((params: CampaignFactoryParams & {
|
|
438
|
+
variant: V;
|
|
439
|
+
scenarioTags: Record<string, string>;
|
|
440
|
+
}) => AgentProfileCell | AgentProfileCellInput | Promise<AgentProfileCell | AgentProfileCellInput>);
|
|
410
441
|
}
|
|
411
442
|
interface CampaignFactoryParams {
|
|
412
443
|
campaignId: string;
|
|
@@ -577,4 +608,4 @@ declare class NoopResearcher implements Researcher {
|
|
|
577
608
|
evaluateChange(_plan: ExperimentPlan): Promise<ExperimentResult>;
|
|
578
609
|
}
|
|
579
610
|
|
|
580
|
-
export { CallbackResearcher as C, type EvalCampaignOptions as E, type FailedRun as F, type LlmClientOptions as L, NoopResearcher as N, type Researcher as R, type SteeringChange as S, type CallbackResearcherOptions as a, type CampaignFactoryParams as b, type CampaignIntegrityPolicy as c, type CampaignRunContext as d, type CampaignRunOutcome as e, type CampaignRunner as f, type CampaignScenario as g, type CampaignVariant as h, type EvalCampaignResult as i, type ExperimentPlan as j, type ExperimentResult as k, type FailureMode as l, type LlmCallRequest as m, type LlmCallResult as n, LlmCallError as o, LlmClient as p, type LlmMessage as q, runEvalCampaign as r, LlmRouteAssertionError as s, type LlmRouteRequirements as t, type LlmUsage as u, assertLlmRoute as v,
|
|
611
|
+
export { probeLlm as A, stripFencedJson as B, CallbackResearcher as C, type EvalCampaignOptions as E, type FailedRun as F, type LlmClientOptions as L, NoopResearcher as N, type Researcher as R, type SteeringChange as S, type CallbackResearcherOptions as a, type CampaignFactoryParams as b, type CampaignIntegrityPolicy as c, type CampaignRunContext as d, type CampaignRunOutcome as e, type CampaignRunner as f, type CampaignScenario as g, type CampaignVariant as h, type EvalCampaignResult as i, type ExperimentPlan as j, type ExperimentResult as k, type FailureMode as l, type LlmCallRequest as m, type LlmCallResult as n, LlmCallError as o, LlmClient as p, type LlmMessage as q, runEvalCampaign as r, LlmRouteAssertionError as s, type LlmRouteRequirements as t, type LlmUsage as u, assertLlmRoute as v, backoffMs as w, callLlm as x, callLlmJson as y, isTransientLlmError as z };
|
package/dist/rl.d.ts
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
import { R as RunRecord, a as RunSplitTag } from './run-record-BfX5y68A.js';
|
|
1
|
+
import { R as RunRecord, a as RunSplitTag } from './run-record-YinVdFwu.js';
|
|
2
2
|
import { V as VerificationReport } from './multi-layer-verifier-BNi4-8lR.js';
|
|
3
|
-
import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-D7AQS7eB.js';
|
|
3
|
+
import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-BPJVzIeW.js';
|
|
4
4
|
import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
5
|
-
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-CMHypZ_M.js';
|
|
5
|
+
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-BvaNwfBE.js';
|
|
6
6
|
import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
|
|
7
7
|
import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
8
|
-
import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-BRHa5Jxo.js';
|
|
9
|
-
export { r as runEvalCampaign } from './researcher-BRHa5Jxo.js';
|
|
8
|
+
import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-CfnL3HEb.js';
|
|
9
|
+
export { r as runEvalCampaign } from './researcher-CfnL3HEb.js';
|
|
10
10
|
import './errors-mje_cKOs.js';
|
|
11
11
|
import './failure-cluster-Cw65_5FY.js';
|
|
12
12
|
import './integrity-DYR5gWlb.js';
|
package/dist/rl.js
CHANGED
|
@@ -1,19 +1,19 @@
|
|
|
1
1
|
import {
|
|
2
2
|
runEvalCampaign
|
|
3
|
-
} from "./chunk-DTEJNZYK.js";
|
|
4
|
-
import "./chunk-
|
|
3
|
+
} from "./chunk-SQYRO3BT.js";
|
|
4
|
+
import "./chunk-VXNVVBZO.js";
|
|
5
|
+
import "./chunk-DCZXFOQN.js";
|
|
5
6
|
import {
|
|
6
7
|
rubricPredictiveValidity
|
|
7
8
|
} from "./chunk-YRZ4M5GS.js";
|
|
8
9
|
import {
|
|
9
10
|
evaluateInterimReleaseConfidence
|
|
10
11
|
} from "./chunk-MAZ26DC7.js";
|
|
12
|
+
import "./chunk-TQL7BAOY.js";
|
|
11
13
|
import {
|
|
12
|
-
benjaminiHochberg
|
|
13
|
-
} from "./chunk-CXJOVDJR.js";
|
|
14
|
-
import {
|
|
14
|
+
benjaminiHochberg,
|
|
15
15
|
wilcoxonSignedRank
|
|
16
|
-
} from "./chunk-4L3WJXQJ.js";
|
|
16
|
+
} from "./chunk-KHZRNY3F.js";
|
|
17
17
|
import "./chunk-UBPIXOC4.js";
|
|
18
18
|
import "./chunk-PC4UYEBM.js";
|
|
19
19
|
import "./chunk-TVVP3ZZQ.js";
|