@tangle-network/agent-eval 0.60.0 → 0.61.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. package/CHANGELOG.md +21 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/otel.d.ts +5 -5
  5. package/dist/agent-profile-9J9hxdm2.d.ts +114 -0
  6. package/dist/benchmarks/index.d.ts +3 -3
  7. package/dist/builder-eval/index.js +2 -2
  8. package/dist/campaign/index.d.ts +151 -11
  9. package/dist/campaign/index.js +211 -10
  10. package/dist/campaign/index.js.map +1 -1
  11. package/dist/{chunk-QDOSODID.js → chunk-3B7Y5AUR.js} +2 -2
  12. package/dist/{chunk-QYJT52YW.js → chunk-3BFEG2F6.js} +1 -1
  13. package/dist/chunk-3BFEG2F6.js.map +1 -0
  14. package/dist/{chunk-J4DIMSRK.js → chunk-6EKXFFGQ.js} +2 -2
  15. package/dist/{chunk-63EPZQUZ.js → chunk-6REHLN5J.js} +2 -2
  16. package/dist/{chunk-GM476SZU.js → chunk-AIWHLG7J.js} +5 -5
  17. package/dist/{chunk-AIXHUIHG.js → chunk-B26KI423.js} +3 -3
  18. package/dist/{chunk-NCK5QLGT.js → chunk-F3SRAAZO.js} +2 -2
  19. package/dist/{chunk-GBHRUAOF.js → chunk-GMXHLSLL.js} +2 -2
  20. package/dist/{chunk-VXNVVBZO.js → chunk-IHDHUN2X.js} +2 -2
  21. package/dist/{chunk-S3SDD56V.js → chunk-ITBRCT73.js} +2 -2
  22. package/dist/{chunk-OLIBRKRD.js → chunk-KX6F6NCG.js} +2 -2
  23. package/dist/{chunk-NOPYCRNG.js → chunk-OLULBECP.js} +13 -2
  24. package/dist/chunk-OLULBECP.js.map +1 -0
  25. package/dist/chunk-PQV2TKC3.js +27 -0
  26. package/dist/chunk-PQV2TKC3.js.map +1 -0
  27. package/dist/{chunk-UBPIXOC4.js → chunk-SBCB6VZY.js} +2 -2
  28. package/dist/{chunk-LBSXXH56.js → chunk-SUGME4OT.js} +5 -5
  29. package/dist/chunk-SUGME4OT.js.map +1 -0
  30. package/dist/{chunk-YTMXBHFM.js → chunk-T375SUOZ.js} +2 -2
  31. package/dist/{chunk-PIEAE33T.js → chunk-Z4ZCBC7M.js} +2 -2
  32. package/dist/cli.js +3 -3
  33. package/dist/contract/index.d.ts +13 -13
  34. package/dist/contract/index.js +7 -7
  35. package/dist/{control-DjEgwWNo.d.ts → control-Bf8owbuG.d.ts} +2 -2
  36. package/dist/control.d.ts +5 -5
  37. package/dist/control.js +3 -3
  38. package/dist/{dataset-BlwAtYYf.d.ts → dataset-B2kL-fSM.d.ts} +1 -1
  39. package/dist/{errors-mje_cKOs.d.ts → errors-Dwqw-T_m.d.ts} +1 -1
  40. package/dist/{feedback-trajectory-DpUmE90J.d.ts → feedback-trajectory-8hKC5EOb.d.ts} +1 -1
  41. package/dist/governance/index.d.ts +3 -3
  42. package/dist/hosted/index.d.ts +5 -5
  43. package/dist/{index-wlaiph9Y.d.ts → index-Bvk35ils.d.ts} +1 -1
  44. package/dist/{index-BIkvdkSU.d.ts → index-D9dwa00f.d.ts} +2 -2
  45. package/dist/index.d.ts +24 -132
  46. package/dist/index.js +16 -29
  47. package/dist/index.js.map +1 -1
  48. package/dist/{integrity-CfXjSqEv.d.ts → integrity-CJzrpUua.d.ts} +1 -1
  49. package/dist/{llm-client-BXVRUZyX.d.ts → llm-client-DbjLfz-K.d.ts} +1 -1
  50. package/dist/meta-eval/index.d.ts +3 -3
  51. package/dist/openapi.json +1 -1
  52. package/dist/pipelines/index.js +3 -3
  53. package/dist/{provenance-BM8vmMBa.d.ts → provenance-D0WeCXt1.d.ts} +5 -5
  54. package/dist/{red-team-CrC5MZYd.d.ts → red-team-DW9Ca_tj.d.ts} +1 -1
  55. package/dist/{registry-DK9kqXvb.d.ts → registry-qmbYT3Eo.d.ts} +2 -2
  56. package/dist/{release-report-DmPjIce3.d.ts → release-report-DszkgvJ3.d.ts} +3 -3
  57. package/dist/reporting.d.ts +6 -6
  58. package/dist/reporting.js +4 -4
  59. package/dist/{researcher-JP8EvnLv.d.ts → researcher-BaVsy0sW.d.ts} +4 -4
  60. package/dist/rl.d.ts +9 -9
  61. package/dist/rl.js +7 -7
  62. package/dist/{rubric-predictive-validity-B3qNa4aY.d.ts → rubric-predictive-validity-DgBHWsh7.d.ts} +1 -1
  63. package/dist/run-campaign-HXPJAUZ3.js +10 -0
  64. package/dist/{run-record-etiCMsUq.d.ts → run-record-DgUVo5pw.d.ts} +1 -1
  65. package/dist/{summary-report-DLxh4yWk.d.ts → summary-report-BQvXpvaR.d.ts} +1 -1
  66. package/dist/traces.d.ts +2 -2
  67. package/dist/traces.js +3 -3
  68. package/dist/{types-VCIXx_yo.d.ts → types-Beb6KPqZ.d.ts} +21 -1
  69. package/dist/wire/index.d.ts +3 -3
  70. package/dist/wire/index.js +3 -3
  71. package/package.json +12 -25
  72. package/dist/chunk-LBSXXH56.js.map +0 -1
  73. package/dist/chunk-NOPYCRNG.js.map +0 -1
  74. package/dist/chunk-QYJT52YW.js.map +0 -1
  75. package/dist/run-campaign-5XENUKRF.js +0 -10
  76. /package/dist/{chunk-QDOSODID.js.map → chunk-3B7Y5AUR.js.map} +0 -0
  77. /package/dist/{chunk-J4DIMSRK.js.map → chunk-6EKXFFGQ.js.map} +0 -0
  78. /package/dist/{chunk-63EPZQUZ.js.map → chunk-6REHLN5J.js.map} +0 -0
  79. /package/dist/{chunk-GM476SZU.js.map → chunk-AIWHLG7J.js.map} +0 -0
  80. /package/dist/{chunk-AIXHUIHG.js.map → chunk-B26KI423.js.map} +0 -0
  81. /package/dist/{chunk-NCK5QLGT.js.map → chunk-F3SRAAZO.js.map} +0 -0
  82. /package/dist/{chunk-GBHRUAOF.js.map → chunk-GMXHLSLL.js.map} +0 -0
  83. /package/dist/{chunk-VXNVVBZO.js.map → chunk-IHDHUN2X.js.map} +0 -0
  84. /package/dist/{chunk-S3SDD56V.js.map → chunk-ITBRCT73.js.map} +0 -0
  85. /package/dist/{chunk-OLIBRKRD.js.map → chunk-KX6F6NCG.js.map} +0 -0
  86. /package/dist/{chunk-UBPIXOC4.js.map → chunk-SBCB6VZY.js.map} +0 -0
  87. /package/dist/{chunk-YTMXBHFM.js.map → chunk-T375SUOZ.js.map} +0 -0
  88. /package/dist/{chunk-PIEAE33T.js.map → chunk-Z4ZCBC7M.js.map} +0 -0
  89. /package/dist/{run-campaign-5XENUKRF.js.map → run-campaign-HXPJAUZ3.js.map} +0 -0
@@ -1,4 +1,4 @@
1
- import { C as CaptureIntegrityError } from './errors-mje_cKOs.js';
1
+ import { C as CaptureIntegrityError } from './errors-Dwqw-T_m.js';
2
2
  import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
3
3
  import { T as TraceStore } from './store-CKUAgsJz.js';
4
4
 
@@ -1,4 +1,4 @@
1
- import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-mje_cKOs.js';
1
+ import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-Dwqw-T_m.js';
2
2
  import { R as RawProviderSink, P as ProviderRedactor } from './raw-provider-sink-C46HDghv.js';
3
3
 
4
4
  /**
@@ -2,9 +2,9 @@ import { T as TraceStore } from '../store-CKUAgsJz.js';
2
2
  import { R as Run } from '../schema-m0gsnbt3.js';
3
3
  import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
4
4
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
5
- export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-B3qNa4aY.js';
6
- import '../run-record-etiCMsUq.js';
7
- import '../errors-mje_cKOs.js';
5
+ export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-DgBHWsh7.js';
6
+ import '../run-record-DgUVo5pw.js';
7
+ import '../errors-Dwqw-T_m.js';
8
8
 
9
9
  /**
10
10
  * Correlation study — "does our eval score predict real-world outcomes?"
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.60.0",
5
+ "version": "0.61.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -3,13 +3,13 @@ import {
3
3
  classifyFailure,
4
4
  compareToBaseline,
5
5
  computeToolUseMetrics
6
- } from "../chunk-QDOSODID.js";
6
+ } from "../chunk-3B7Y5AUR.js";
7
7
  import {
8
8
  buildTrajectory
9
9
  } from "../chunk-RZTMDUO7.js";
10
10
  import {
11
11
  interRaterReliability
12
- } from "../chunk-S3SDD56V.js";
12
+ } from "../chunk-ITBRCT73.js";
13
13
  import {
14
14
  aggregateLlm,
15
15
  argHash,
@@ -18,7 +18,7 @@ import {
18
18
  toolSpans
19
19
  } from "../chunk-47X6LRCE.js";
20
20
  import "../chunk-5BKGXME7.js";
21
- import "../chunk-QYJT52YW.js";
21
+ import "../chunk-3BFEG2F6.js";
22
22
  import "../chunk-PZ5AY32C.js";
23
23
 
24
24
  // src/pipelines/budget-breach.ts
@@ -1,8 +1,8 @@
1
- import { S as Scenario, k as CampaignResult, o as GateResult, u as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, l as CampaignTraceWriter, M as MutableSurface, q as GenerationRecord, n as GateDecision } from './types-VCIXx_yo.js';
2
- import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
3
- import { R as RedTeamCase } from './red-team-CrC5MZYd.js';
4
- import { R as RunRecord } from './run-record-etiCMsUq.js';
5
- import { H as HostedClient, T as TraceSpanEvent } from './index-BIkvdkSU.js';
1
+ import { S as Scenario, C as CampaignResult, q as GateResult, v as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, n as CampaignTraceWriter, M as MutableSurface, s as GenerationRecord, p as GateDecision } from './types-Beb6KPqZ.js';
2
+ import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
3
+ import { R as RedTeamCase } from './red-team-DW9Ca_tj.js';
4
+ import { R as RunRecord } from './run-record-DgUVo5pw.js';
5
+ import { H as HostedClient, T as TraceSpanEvent } from './index-D9dwa00f.js';
6
6
 
7
7
  /**
8
8
  * @experimental
@@ -1,4 +1,4 @@
1
- import { a as DatasetScenario, b as Dataset } from './dataset-BlwAtYYf.js';
1
+ import { a as DatasetScenario, b as Dataset } from './dataset-B2kL-fSM.js';
2
2
  import { T as TraceStore } from './store-CKUAgsJz.js';
3
3
 
4
4
  /**
@@ -1,5 +1,5 @@
1
- import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-BXVRUZyX.js';
2
- import { R as RunRecord } from './run-record-etiCMsUq.js';
1
+ import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-DbjLfz-K.js';
2
+ import { R as RunRecord } from './run-record-DgUVo5pw.js';
3
3
  import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
4
4
  import { J as JudgeInput } from './types-DhqpAi_z.js';
5
5
 
@@ -1,8 +1,8 @@
1
1
  import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
2
2
  import { a as JudgeScore } from './types-DhqpAi_z.js';
3
- import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-BlwAtYYf.js';
4
- import { m as GateDecision } from './summary-report-DLxh4yWk.js';
5
- import { R as RunRecord, a as RunSplitTag } from './run-record-etiCMsUq.js';
3
+ import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-B2kL-fSM.js';
4
+ import { m as GateDecision } from './summary-report-BQvXpvaR.js';
5
+ import { R as RunRecord, a as RunSplitTag } from './run-record-DgUVo5pw.js';
6
6
 
7
7
  /**
8
8
  * Release confidence gate.
@@ -1,14 +1,14 @@
1
- export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-B3qNa4aY.js';
2
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DmPjIce3.js';
1
+ export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-DgBHWsh7.js';
2
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DszkgvJ3.js';
3
3
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
4
- export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-DLxh4yWk.js';
5
- import './run-record-etiCMsUq.js';
6
- import './errors-mje_cKOs.js';
4
+ export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-BQvXpvaR.js';
5
+ import './run-record-DgUVo5pw.js';
6
+ import './errors-Dwqw-T_m.js';
7
7
  import './schema-m0gsnbt3.js';
8
8
  import './outcome-store-D6KWmYvj.js';
9
9
  import './judge-calibration-DilmB3Ml.js';
10
10
  import './types-DhqpAi_z.js';
11
11
  import '@tangle-network/tcloud';
12
- import './dataset-BlwAtYYf.js';
12
+ import './dataset-B2kL-fSM.js';
13
13
  import './failure-cluster-CL7IVgkJ.js';
14
14
  import './store-CKUAgsJz.js';
package/dist/reporting.js CHANGED
@@ -4,7 +4,7 @@ import {
4
4
  evaluateReleaseConfidence,
5
5
  judgeReplayGate,
6
6
  renderReleaseReport
7
- } from "./chunk-AIXHUIHG.js";
7
+ } from "./chunk-B26KI423.js";
8
8
  import {
9
9
  rubricPredictiveValidity
10
10
  } from "./chunk-YRZ4M5GS.js";
@@ -18,14 +18,14 @@ import {
18
18
  paretoChart,
19
19
  researchReport,
20
20
  summaryTable
21
- } from "./chunk-OLIBRKRD.js";
21
+ } from "./chunk-KX6F6NCG.js";
22
22
  import {
23
23
  benjaminiHochberg,
24
24
  pairedBootstrap,
25
25
  wilcoxonSignedRank
26
- } from "./chunk-S3SDD56V.js";
26
+ } from "./chunk-ITBRCT73.js";
27
27
  import "./chunk-VSMTAMNK.js";
28
- import "./chunk-QYJT52YW.js";
28
+ import "./chunk-3BFEG2F6.js";
29
29
  import "./chunk-PZ5AY32C.js";
30
30
  export {
31
31
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
@@ -1,8 +1,8 @@
1
- import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-etiCMsUq.js';
2
- import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-BXVRUZyX.js';
3
- import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-DLxh4yWk.js';
1
+ import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-DgUVo5pw.js';
2
+ import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-DbjLfz-K.js';
3
+ import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-BQvXpvaR.js';
4
4
  import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DEZwY14K.js';
5
- import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CfXjSqEv.js';
5
+ import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CJzrpUua.js';
6
6
  import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
7
7
  import { F as FailureClass } from './schema-m0gsnbt3.js';
8
8
  import { T as TraceStore } from './store-CKUAgsJz.js';
package/dist/rl.d.ts CHANGED
@@ -1,20 +1,20 @@
1
- import { R as RunRecord, a as RunSplitTag } from './run-record-etiCMsUq.js';
2
- import { k as CampaignResult } from './types-VCIXx_yo.js';
3
- import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-JP8EvnLv.js';
4
- export { r as runEvalCampaign } from './researcher-JP8EvnLv.js';
1
+ import { R as RunRecord, a as RunSplitTag } from './run-record-DgUVo5pw.js';
2
+ import { C as CampaignResult } from './types-Beb6KPqZ.js';
3
+ import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-BaVsy0sW.js';
4
+ export { r as runEvalCampaign } from './researcher-BaVsy0sW.js';
5
5
  import { S as Span } from './schema-m0gsnbt3.js';
6
6
  import { T as TraceStore } from './store-CKUAgsJz.js';
7
7
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
8
8
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-D6KWmYvj.js';
9
- import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-B3qNa4aY.js';
9
+ import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-DgBHWsh7.js';
10
10
  import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
11
- import './errors-mje_cKOs.js';
12
- import './llm-client-BXVRUZyX.js';
11
+ import './errors-Dwqw-T_m.js';
12
+ import './llm-client-DbjLfz-K.js';
13
13
  import './raw-provider-sink-C46HDghv.js';
14
- import './summary-report-DLxh4yWk.js';
14
+ import './summary-report-BQvXpvaR.js';
15
15
  import './failure-cluster-CL7IVgkJ.js';
16
16
  import './emitter-DEZwY14K.js';
17
- import './integrity-CfXjSqEv.js';
17
+ import './integrity-CJzrpUua.js';
18
18
 
19
19
  /**
20
20
  * Test-time compute scaling curves.
package/dist/rl.js CHANGED
@@ -10,27 +10,27 @@ import {
10
10
  } from "./chunk-3RF76KTD.js";
11
11
  import {
12
12
  runEvalCampaign
13
- } from "./chunk-GM476SZU.js";
14
- import "./chunk-NCK5QLGT.js";
13
+ } from "./chunk-AIWHLG7J.js";
14
+ import "./chunk-F3SRAAZO.js";
15
15
  import {
16
16
  rubricPredictiveValidity
17
17
  } from "./chunk-YRZ4M5GS.js";
18
18
  import {
19
19
  evaluateInterimReleaseConfidence
20
20
  } from "./chunk-MAZ26DC7.js";
21
- import "./chunk-OLIBRKRD.js";
21
+ import "./chunk-KX6F6NCG.js";
22
22
  import {
23
23
  benjaminiHochberg,
24
24
  wilcoxonSignedRank
25
- } from "./chunk-S3SDD56V.js";
26
- import "./chunk-UBPIXOC4.js";
25
+ } from "./chunk-ITBRCT73.js";
26
+ import "./chunk-SBCB6VZY.js";
27
27
  import "./chunk-TVVP3ZZQ.js";
28
28
  import "./chunk-VSMTAMNK.js";
29
- import "./chunk-VXNVVBZO.js";
29
+ import "./chunk-IHDHUN2X.js";
30
30
  import "./chunk-PC4UYEBM.js";
31
31
  import {
32
32
  ValidationError
33
- } from "./chunk-QYJT52YW.js";
33
+ } from "./chunk-3BFEG2F6.js";
34
34
  import "./chunk-PZ5AY32C.js";
35
35
 
36
36
  // src/rl/compute-curves.ts
@@ -1,4 +1,4 @@
1
- import { R as RunRecord } from './run-record-etiCMsUq.js';
1
+ import { R as RunRecord } from './run-record-DgUVo5pw.js';
2
2
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
3
3
 
4
4
  /**
@@ -0,0 +1,10 @@
1
+ import {
2
+ runCampaign
3
+ } from "./chunk-OLULBECP.js";
4
+ import "./chunk-ITBRCT73.js";
5
+ import "./chunk-3BFEG2F6.js";
6
+ import "./chunk-PZ5AY32C.js";
7
+ export {
8
+ runCampaign
9
+ };
10
+ //# sourceMappingURL=run-campaign-HXPJAUZ3.js.map
@@ -1,4 +1,4 @@
1
- import { V as ValidationError } from './errors-mje_cKOs.js';
1
+ import { V as ValidationError } from './errors-Dwqw-T_m.js';
2
2
  import { F as FailureClass } from './schema-m0gsnbt3.js';
3
3
 
4
4
  type AgentProfileCellSchemaVersion = 'agent-profile-cell/v1';
@@ -1,4 +1,4 @@
1
- import { R as RunRecord } from './run-record-etiCMsUq.js';
1
+ import { R as RunRecord } from './run-record-DgUVo5pw.js';
2
2
  import { F as FailureClusterReport } from './failure-cluster-CL7IVgkJ.js';
3
3
 
4
4
  /**
package/dist/traces.d.ts CHANGED
@@ -1,9 +1,9 @@
1
- import { N as NotFoundError, R as ReplayError } from './errors-mje_cKOs.js';
1
+ import { N as NotFoundError, R as ReplayError } from './errors-Dwqw-T_m.js';
2
2
  import { P as ProviderRedactor, R as RawProviderSink, d as RawProviderEvent } from './raw-provider-sink-C46HDghv.js';
3
3
  export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, c as RawProviderDirection, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';
4
4
  import { R as RunCompleteHook, a as RunCompleteHookContext } from './emitter-DEZwY14K.js';
5
5
  export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DEZwY14K.js';
6
- export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CfXjSqEv.js';
6
+ export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CJzrpUua.js';
7
7
  import { T as TraceStore } from './store-CKUAgsJz.js';
8
8
  export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreOptions, I as InMemoryTraceStore, R as RunFilter, S as SpanFilter } from './store-CKUAgsJz.js';
9
9
  export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-CqTxMwDw.js';
package/dist/traces.js CHANGED
@@ -34,7 +34,7 @@ import {
34
34
  tokenizeDomainWords,
35
35
  traceAnalystFunctionGroup,
36
36
  traceAnalystOnRunComplete
37
- } from "./chunk-PIEAE33T.js";
37
+ } from "./chunk-Z4ZCBC7M.js";
38
38
  import {
39
39
  DEFAULT_REDACTION_RULES,
40
40
  REDACTION_VERSION,
@@ -64,7 +64,7 @@ import {
64
64
  RunIntegrityError,
65
65
  assertRunCaptured,
66
66
  throwIfRunIncomplete
67
- } from "./chunk-UBPIXOC4.js";
67
+ } from "./chunk-SBCB6VZY.js";
68
68
  import {
69
69
  TraceEmitter,
70
70
  llmSpanFromProvider
@@ -77,7 +77,7 @@ import {
77
77
  defaultProviderRedactor,
78
78
  providerFromBaseUrl
79
79
  } from "./chunk-PC4UYEBM.js";
80
- import "./chunk-QYJT52YW.js";
80
+ import "./chunk-3BFEG2F6.js";
81
81
  import "./chunk-PZ5AY32C.js";
82
82
  export {
83
83
  DEFAULT_REDACTION_RULES,
@@ -257,12 +257,28 @@ interface CampaignArtifactWriter {
257
257
  write(path: string, content: string | Uint8Array): Promise<string>;
258
258
  writeJson(path: string, value: unknown): Promise<string>;
259
259
  }
260
+ /** Token usage accumulated for a cell. Structurally mirrors `RunTokenUsage`
261
+ * (run-record.ts) so a cell maps cleanly onto a `RunRecord` for the
262
+ * backend-integrity guard without coupling the campaign module to it. */
263
+ interface CampaignTokenUsage {
264
+ input: number;
265
+ output: number;
266
+ cached?: number;
267
+ }
260
268
  /** @experimental Cell-scoped cost meter. Substrate auto-tracks LLM costs
261
269
  * via the cost-ledger backend hooks; consumers can record additional
262
270
  * spend (sandbox time, tool costs) via `observe`. */
263
271
  interface CampaignCostMeter {
264
272
  observe(amountUsd: number, source: string): void;
273
+ /** Record LLM token usage for this cell; accumulates across calls. A cell
274
+ * has `costUsd` but no token counts unless the dispatch reports them here —
275
+ * and the backend-integrity guard (`assertRealBackend`) keys on
276
+ * `tokenUsage`, so a cell that never reports tokens reads as a stub. Any
277
+ * dispatch that calls an LLM MUST report its usage. */
278
+ observeTokens(usage: CampaignTokenUsage): void;
265
279
  current(): number;
280
+ /** Accumulated token usage for this cell (zeros if never observed). */
281
+ tokens(): CampaignTokenUsage;
266
282
  }
267
283
  /** @experimental Source tag — required on every store write. Used by the
268
284
  * default training-source filter (production-trace samples NOT used as
@@ -352,6 +368,10 @@ interface CampaignCellResult<TArtifact> {
352
368
  artifact: TArtifact;
353
369
  judgeScores: Record<string, JudgeScore>;
354
370
  costUsd: number;
371
+ /** LLM token usage the dispatch reported via `ctx.cost.observeTokens`.
372
+ * `{ input: 0, output: 0 }` when the dispatch reported none — which the
373
+ * backend-integrity guard reads as a stub. */
374
+ tokenUsage: CampaignTokenUsage;
355
375
  durationMs: number;
356
376
  seed: number;
357
377
  cached: boolean;
@@ -430,4 +450,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
430
450
  scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
431
451
  }
432
452
 
433
- export { type CodeSurface as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ProposeContext as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type LabeledScenarioWrite as c, type LabeledScenarioSampleArgs as d, type LabeledScenarioRecord as e, type LabelTrust as f, type CampaignAggregates as g, type CampaignArtifactWriter as h, type CampaignCellResult as i, type CampaignCostMeter as j, type CampaignResult as k, type CampaignTraceWriter as l, type GateContext as m, type GateDecision as n, type GateResult as o, type GenerationCandidate as p, type GenerationRecord as q, type JudgeAggregate as r, type JudgeDimension as s, type LabeledScenarioSource as t, type Mutator as u, type ProposedCandidate as v, type ScenarioAggregate as w, type SessionScript as x, isProposedCandidate as y, labelTrustRank as z };
453
+ export { labelTrustRank as A, type CampaignResult as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ProposeContext as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type LabeledScenarioWrite as c, type LabeledScenarioSampleArgs as d, type LabeledScenarioRecord as e, type LabelTrust as f, type LabeledScenarioSource as g, type CodeSurface as h, type CampaignAggregates as i, type CampaignArtifactWriter as j, type CampaignCellResult as k, type CampaignCostMeter as l, type CampaignTokenUsage as m, type CampaignTraceWriter as n, type GateContext as o, type GateDecision as p, type GateResult as q, type GenerationCandidate as r, type GenerationRecord as s, type JudgeAggregate as t, type JudgeDimension as u, type Mutator as v, type ProposedCandidate as w, type ScenarioAggregate as x, type SessionScript as y, isProposedCandidate as z };
@@ -1,4 +1,4 @@
1
- import { F as FeedbackTrajectoryStore } from '../feedback-trajectory-DpUmE90J.js';
1
+ import { F as FeedbackTrajectoryStore } from '../feedback-trajectory-8hKC5EOb.js';
2
2
  import { T as TraceStore } from '../store-CKUAgsJz.js';
3
3
  import { z } from 'zod';
4
4
  import { OpenAPIObject } from 'openapi3-ts/oas31';
@@ -8,8 +8,8 @@ import { Hono } from 'hono';
8
8
  import '../control-runtime-DuFBYg7A.js';
9
9
  import '../emitter-DEZwY14K.js';
10
10
  import '../schema-m0gsnbt3.js';
11
- import '../dataset-BlwAtYYf.js';
12
- import '../errors-mje_cKOs.js';
11
+ import '../dataset-B2kL-fSM.js';
12
+ import '../errors-Dwqw-T_m.js';
13
13
 
14
14
  declare const RubricDimensionSchema: z.ZodObject<{
15
15
  id: z.ZodString;
@@ -34,10 +34,10 @@ import {
34
34
  runRpcOnce,
35
35
  startServer,
36
36
  startServerAsync
37
- } from "../chunk-63EPZQUZ.js";
38
- import "../chunk-VXNVVBZO.js";
37
+ } from "../chunk-6REHLN5J.js";
38
+ import "../chunk-IHDHUN2X.js";
39
39
  import "../chunk-PC4UYEBM.js";
40
- import "../chunk-QYJT52YW.js";
40
+ import "../chunk-3BFEG2F6.js";
41
41
  import "../chunk-PZ5AY32C.js";
42
42
  export {
43
43
  BUILTIN_RUBRICS,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.60.0",
3
+ "version": "0.61.0",
4
4
  "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {
@@ -144,18 +144,6 @@
144
144
  "publishConfig": {
145
145
  "access": "public"
146
146
  },
147
- "scripts": {
148
- "build": "tsup && pnpm openapi",
149
- "dev": "tsup --watch",
150
- "prepare": "husky",
151
- "prepublishOnly": "pnpm build",
152
- "test": "vitest run",
153
- "test:watch": "vitest",
154
- "typecheck": "tsc --noEmit",
155
- "lint": "biome check src",
156
- "format": "biome format --write src",
157
- "openapi": "node dist/cli.js openapi --out dist/openapi.json"
158
- },
159
147
  "dependencies": {
160
148
  "@asteasolutions/zod-to-openapi": "^8.5.0",
161
149
  "@ax-llm/ax": "^19.0.25",
@@ -183,16 +171,6 @@
183
171
  "typescript": "^5.7.0",
184
172
  "vitest": "^3.0.0"
185
173
  },
186
- "pnpm": {
187
- "minimumReleaseAge": 4320,
188
- "minimumReleaseAgeExclude": [
189
- "@tangle-network/sandbox"
190
- ],
191
- "overrides": {
192
- "postcss@<8.5.10": "^8.5.10",
193
- "ws@>=8.0.0 <8.20.1": "^8.20.1"
194
- }
195
- },
196
174
  "engines": {
197
175
  "node": ">=20"
198
176
  },
@@ -205,5 +183,14 @@
205
183
  ]
206
184
  },
207
185
  "license": "MIT",
208
- "packageManager": "pnpm@10.22.0"
209
- }
186
+ "scripts": {
187
+ "build": "tsup && pnpm openapi",
188
+ "dev": "tsup --watch",
189
+ "test": "vitest run",
190
+ "test:watch": "vitest",
191
+ "typecheck": "tsc --noEmit",
192
+ "lint": "biome check src",
193
+ "format": "biome format --write src",
194
+ "openapi": "node dist/cli.js openapi --out dist/openapi.json"
195
+ }
196
+ }