@tangle-network/agent-eval 0.33.0 → 0.34.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. package/CHANGELOG.md +33 -0
  2. package/dist/benchmarks/index.d.ts +2 -2
  3. package/dist/chunk-DCZXFOQN.js +489 -0
  4. package/dist/chunk-DCZXFOQN.js.map +1 -0
  5. package/dist/{chunk-B73G44OH.js → chunk-FT3IAMQR.js} +5 -5
  6. package/dist/chunk-FT3IAMQR.js.map +1 -0
  7. package/dist/{chunk-GVQT44CS.js → chunk-KE7TDJUO.js} +2 -2
  8. package/dist/{chunk-4L3WJXQJ.js → chunk-KHZRNY3F.js} +163 -2
  9. package/dist/{chunk-4L3WJXQJ.js.map → chunk-KHZRNY3F.js.map} +1 -1
  10. package/dist/{chunk-WGXZAQLR.js → chunk-LGAPK7NA.js} +2 -2
  11. package/dist/{chunk-DTEJNZYK.js → chunk-SQYRO3BT.js} +47 -4
  12. package/dist/chunk-SQYRO3BT.js.map +1 -0
  13. package/dist/{chunk-CXJOVDJR.js → chunk-TQL7BAOY.js} +5 -175
  14. package/dist/chunk-TQL7BAOY.js.map +1 -0
  15. package/dist/{chunk-M6RZ5LJN.js → chunk-VXNVVBZO.js} +34 -5
  16. package/dist/chunk-VXNVVBZO.js.map +1 -0
  17. package/dist/{chunk-S4Y5VXMS.js → chunk-WRGHMGWT.js} +2 -2
  18. package/dist/{chunk-SMSGXM74.js → chunk-YU3G6I7F.js} +2 -2
  19. package/dist/cli.js +2 -2
  20. package/dist/{control-p2ns7elI.d.ts → control-C3k02SCP.d.ts} +1 -1
  21. package/dist/control.d.ts +2 -2
  22. package/dist/control.js +3 -2
  23. package/dist/governance/index.d.ts +2 -1
  24. package/dist/{index-DPILdKbP.d.ts → index-CN2agEaO.d.ts} +2 -142
  25. package/dist/{index-BTqhGHJT.d.ts → index-ClMxVqe_.d.ts} +1 -1
  26. package/dist/index.d.ts +278 -486
  27. package/dist/index.js +522 -134
  28. package/dist/index.js.map +1 -1
  29. package/dist/judge-calibration-DilmB3Ml.d.ts +142 -0
  30. package/dist/meta-eval/index.d.ts +2 -2
  31. package/dist/openapi.json +1 -1
  32. package/dist/optimization.d.ts +3 -3
  33. package/dist/optimization.js +6 -6
  34. package/dist/pipelines/index.js +2 -2
  35. package/dist/release-report-ChfmCmLi.d.ts +713 -0
  36. package/dist/reporting.d.ts +6 -4
  37. package/dist/reporting.js +10 -9
  38. package/dist/{researcher-BRHa5Jxo.d.ts → researcher-CfnL3HEb.d.ts} +34 -3
  39. package/dist/rl.d.ts +5 -5
  40. package/dist/rl.js +6 -6
  41. package/dist/rl.js.map +1 -1
  42. package/dist/{rubric-predictive-validity-CMHypZ_M.d.ts → rubric-predictive-validity-BvaNwfBE.d.ts} +1 -1
  43. package/dist/{run-record-BfX5y68A.d.ts → run-record-YinVdFwu.d.ts} +78 -2
  44. package/dist/{summary-report-D7AQS7eB.d.ts → summary-report-BPJVzIeW.d.ts} +2 -2
  45. package/dist/wire/index.js +2 -2
  46. package/docs/product-eval-adoption.md +18 -0
  47. package/package.json +12 -22
  48. package/dist/chunk-B73G44OH.js.map +0 -1
  49. package/dist/chunk-CXJOVDJR.js.map +0 -1
  50. package/dist/chunk-DTEJNZYK.js.map +0 -1
  51. package/dist/chunk-M6RZ5LJN.js.map +0 -1
  52. package/dist/chunk-ZN2CMQIW.js +0 -208
  53. package/dist/chunk-ZN2CMQIW.js.map +0 -1
  54. package/dist/release-report-DLWbBPtH.d.ts +0 -292
  55. /package/dist/{chunk-GVQT44CS.js.map → chunk-KE7TDJUO.js.map} +0 -0
  56. /package/dist/{chunk-WGXZAQLR.js.map → chunk-LGAPK7NA.js.map} +0 -0
  57. /package/dist/{chunk-S4Y5VXMS.js.map → chunk-WRGHMGWT.js.map} +0 -0
  58. /package/dist/{chunk-SMSGXM74.js.map → chunk-YU3G6I7F.js.map} +0 -0
package/dist/reporting.d.ts CHANGED
@@ -1,10 +1,12 @@
1
- export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-CMHypZ_M.js';
2
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-DLWbBPtH.js';
1
+ export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-BvaNwfBE.js';
2
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as releaseTraceEvidenceFromMultiShotTrials, s as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-ChfmCmLi.js';
3
3
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
4
- export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-D7AQS7eB.js';
5
- import './run-record-BfX5y68A.js';
4
+ export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-BPJVzIeW.js';
5
+ import './run-record-YinVdFwu.js';
6
6
  import './errors-mje_cKOs.js';
7
7
  import './outcome-store-D6KWmYvj.js';
8
+ import './judge-calibration-DilmB3Ml.js';
9
+ import '@tangle-network/tcloud';
8
10
  import './dataset-ueRVTUoY.js';
9
11
  import './failure-cluster-Cw65_5FY.js';
10
12
  import './store-Db2Bv8Cf.js';
package/dist/reporting.js CHANGED
@@ -5,7 +5,7 @@ import {
5
5
  judgeReplayGate,
6
6
  releaseTraceEvidenceFromMultiShotTrials,
7
7
  renderReleaseReport
8
- } from "./chunk-WGXZAQLR.js";
8
+ } from "./chunk-LGAPK7NA.js";
9
9
  import {
10
10
  rubricPredictiveValidity
11
11
  } from "./chunk-YRZ4M5GS.js";
@@ -15,22 +15,23 @@ import {
15
15
  } from "./chunk-MAZ26DC7.js";
16
16
  import {
17
17
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
18
- bhAdjust,
19
18
  gainHistogram,
20
- pairedBootstrap,
21
- pairedWilcoxon,
22
19
  paretoChart,
23
20
  researchReport,
24
21
  summaryTable
25
- } from "./chunk-CXJOVDJR.js";
26
- import "./chunk-4L3WJXQJ.js";
22
+ } from "./chunk-TQL7BAOY.js";
23
+ import {
24
+ benjaminiHochberg,
25
+ pairedBootstrap,
26
+ wilcoxonSignedRank
27
+ } from "./chunk-KHZRNY3F.js";
27
28
  import "./chunk-VSMTAMNK.js";
28
29
  import "./chunk-QYJT52YW.js";
29
30
  import "./chunk-PZ5AY32C.js";
30
31
  export {
31
32
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
32
33
  assertReleaseConfidence,
33
- bhAdjust,
34
+ benjaminiHochberg,
34
35
  bootstrapCi,
35
36
  evaluateInterimReleaseConfidence,
36
37
  evaluateReleaseConfidence,
@@ -38,12 +39,12 @@ export {
38
39
  judgeReplayGate,
39
40
  pairedBootstrap,
40
41
  pairedEvalueSequence,
41
- pairedWilcoxon,
42
42
  paretoChart,
43
43
  releaseTraceEvidenceFromMultiShotTrials,
44
44
  renderReleaseReport,
45
45
  researchReport,
46
46
  rubricPredictiveValidity,
47
- summaryTable
47
+ summaryTable,
48
+ wilcoxonSignedRank
48
49
  };
49
50
  //# sourceMappingURL=reporting.js.map
@@ -1,7 +1,7 @@
1
+ import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-YinVdFwu.js';
1
2
  import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-mje_cKOs.js';
2
3
  import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-DYR5gWlb.js';
3
- import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, R as RunRecord } from './run-record-BfX5y68A.js';
4
- import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-D7AQS7eB.js';
4
+ import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-BPJVzIeW.js';
5
5
  import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
6
6
  import { T as TraceStore } from './store-Db2Bv8Cf.js';
7
7
 
@@ -126,6 +126,21 @@ interface LlmClientOptions {
126
126
  /** Override the redaction strategy for this call. Defaults to `defaultProviderRedactor`. */
127
127
  redactor?: ProviderRedactor;
128
128
  }
129
+ /**
130
+ * True when an error is a transient transport/network fault worth retrying,
131
+ * as opposed to a deterministic failure (4xx schema reject, JSON parse) that
132
+ * a retry cannot fix. Inspects `LlmCallError.status`, then the error's
133
+ * name/message/code, then recurses into `error.cause` — undici nests the
134
+ * real socket fault one or more levels under `.cause`.
135
+ *
136
+ * This is THE retry classifier for the package: `callLlm` and
137
+ * `withJudgeRetry` both route through it, so a connection-class error is
138
+ * treated identically whether it surfaces in the HTTP client or a
139
+ * TCloud-backed judge.
140
+ */
141
+ declare function isTransientLlmError(err: unknown): boolean;
142
+ /** Exponential backoff: 500ms, 1s, 2s, 4s, ... capped at 16s. Attempt is 0-indexed. */
143
+ declare function backoffMs(attempt: number): number;
129
144
  /**
130
145
  * Strip a ```json / ``` code fence if the model emitted one.
131
146
  * Idempotent for naked JSON. Some models (claude-code via router, certain
@@ -322,6 +337,12 @@ interface CampaignRunOutcome {
322
337
  * Single-judge or scalar-only runs leave this unset.
323
338
  */
324
339
  judgeScores?: JudgeScoresRecord;
340
+ /**
341
+ * Agent profile cell observed by the runner. When supplied, it overrides
342
+ * `EvalCampaignOptions.agentProfile` for this run and must match the
343
+ * outcome's `model` and `promptHash`.
344
+ */
345
+ agentProfile?: AgentProfileCell | AgentProfileCellInput;
325
346
  }
326
347
  type CampaignRunner<V> = (ctx: CampaignRunContext<V>) => Promise<CampaignRunOutcome>;
327
348
  type CampaignIntegrityPolicy = 'throw' | 'mark_failed' | 'log';
@@ -407,6 +428,16 @@ interface EvalCampaignOptions<V> {
407
428
  now?: () => number;
408
429
  /** Override the runId generator. Tests pin this. */
409
430
  runId?: (params: CampaignFactoryParams) => string;
431
+ /**
432
+ * Agent profile cell for campaign runs. Static profiles can pass an object;
433
+ * routers or variant-specific harnesses can pass a factory. The campaign
434
+ * stamps the built cell onto every `RunRecord` and rejects profile/model or
435
+ * profile/prompt contradictions.
436
+ */
437
+ agentProfile?: AgentProfileCell | AgentProfileCellInput | ((params: CampaignFactoryParams & {
438
+ variant: V;
439
+ scenarioTags: Record<string, string>;
440
+ }) => AgentProfileCell | AgentProfileCellInput | Promise<AgentProfileCell | AgentProfileCellInput>);
410
441
  }
411
442
  interface CampaignFactoryParams {
412
443
  campaignId: string;
@@ -577,4 +608,4 @@ declare class NoopResearcher implements Researcher {
577
608
  evaluateChange(_plan: ExperimentPlan): Promise<ExperimentResult>;
578
609
  }
579
610
 
580
- export { CallbackResearcher as C, type EvalCampaignOptions as E, type FailedRun as F, type LlmClientOptions as L, NoopResearcher as N, type Researcher as R, type SteeringChange as S, type CallbackResearcherOptions as a, type CampaignFactoryParams as b, type CampaignIntegrityPolicy as c, type CampaignRunContext as d, type CampaignRunOutcome as e, type CampaignRunner as f, type CampaignScenario as g, type CampaignVariant as h, type EvalCampaignResult as i, type ExperimentPlan as j, type ExperimentResult as k, type FailureMode as l, type LlmCallRequest as m, type LlmCallResult as n, LlmCallError as o, LlmClient as p, type LlmMessage as q, runEvalCampaign as r, LlmRouteAssertionError as s, type LlmRouteRequirements as t, type LlmUsage as u, assertLlmRoute as v, callLlm as w, callLlmJson as x, probeLlm as y, stripFencedJson as z };
611
+ export { probeLlm as A, stripFencedJson as B, CallbackResearcher as C, type EvalCampaignOptions as E, type FailedRun as F, type LlmClientOptions as L, NoopResearcher as N, type Researcher as R, type SteeringChange as S, type CallbackResearcherOptions as a, type CampaignFactoryParams as b, type CampaignIntegrityPolicy as c, type CampaignRunContext as d, type CampaignRunOutcome as e, type CampaignRunner as f, type CampaignScenario as g, type CampaignVariant as h, type EvalCampaignResult as i, type ExperimentPlan as j, type ExperimentResult as k, type FailureMode as l, type LlmCallRequest as m, type LlmCallResult as n, LlmCallError as o, LlmClient as p, type LlmMessage as q, runEvalCampaign as r, LlmRouteAssertionError as s, type LlmRouteRequirements as t, type LlmUsage as u, assertLlmRoute as v, backoffMs as w, callLlm as x, callLlmJson as y, isTransientLlmError as z };
package/dist/rl.d.ts CHANGED
@@ -1,12 +1,12 @@
1
- import { R as RunRecord, a as RunSplitTag } from './run-record-BfX5y68A.js';
1
+ import { R as RunRecord, a as RunSplitTag } from './run-record-YinVdFwu.js';
2
2
  import { V as VerificationReport } from './multi-layer-verifier-BNi4-8lR.js';
3
- import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-D7AQS7eB.js';
3
+ import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-BPJVzIeW.js';
4
4
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
5
- import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-CMHypZ_M.js';
5
+ import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-BvaNwfBE.js';
6
6
  import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
7
7
  import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
8
- import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-BRHa5Jxo.js';
9
- export { r as runEvalCampaign } from './researcher-BRHa5Jxo.js';
8
+ import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-CfnL3HEb.js';
9
+ export { r as runEvalCampaign } from './researcher-CfnL3HEb.js';
10
10
  import './errors-mje_cKOs.js';
11
11
  import './failure-cluster-Cw65_5FY.js';
12
12
  import './integrity-DYR5gWlb.js';
package/dist/rl.js CHANGED
@@ -1,19 +1,19 @@
1
1
  import {
2
2
  runEvalCampaign
3
- } from "./chunk-DTEJNZYK.js";
4
- import "./chunk-M6RZ5LJN.js";
3
+ } from "./chunk-SQYRO3BT.js";
4
+ import "./chunk-VXNVVBZO.js";
5
+ import "./chunk-DCZXFOQN.js";
5
6
  import {
6
7
  rubricPredictiveValidity
7
8
  } from "./chunk-YRZ4M5GS.js";
8
9
  import {
9
10
  evaluateInterimReleaseConfidence
10
11
  } from "./chunk-MAZ26DC7.js";
12
+ import "./chunk-TQL7BAOY.js";
11
13
  import {
12
- benjaminiHochberg
13
- } from "./chunk-CXJOVDJR.js";
14
- import {
14
+ benjaminiHochberg,
15
15
  wilcoxonSignedRank
16
- } from "./chunk-4L3WJXQJ.js";
16
+ } from "./chunk-KHZRNY3F.js";
17
17
  import "./chunk-UBPIXOC4.js";
18
18
  import "./chunk-PC4UYEBM.js";
19
19
  import "./chunk-TVVP3ZZQ.js";