@tangle-network/agent-eval 0.46.0 → 0.48.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/dist/adapters/traceai.d.ts +109 -0
  2. package/dist/adapters/traceai.js +106 -0
  3. package/dist/adapters/traceai.js.map +1 -0
  4. package/dist/campaign/index.d.ts +2 -2
  5. package/dist/campaign/index.js +1 -1
  6. package/dist/chunk-OYI6RZJK.js +80 -0
  7. package/dist/chunk-OYI6RZJK.js.map +1 -0
  8. package/dist/{chunk-HRKOCLQA.js → chunk-XAP6DJZE.js} +1 -1
  9. package/dist/chunk-XAP6DJZE.js.map +1 -0
  10. package/dist/contract/index.d.ts +21 -3
  11. package/dist/contract/index.js +83 -3
  12. package/dist/contract/index.js.map +1 -1
  13. package/dist/hosted/index.d.ts +192 -0
  14. package/dist/hosted/index.js +10 -0
  15. package/dist/hosted/index.js.map +1 -0
  16. package/dist/index.d.ts +6 -5
  17. package/dist/index.js +30 -3
  18. package/dist/index.js.map +1 -1
  19. package/dist/matrix/index.d.ts +2 -2
  20. package/dist/multishot/index.d.ts +2 -2
  21. package/dist/openapi.json +1 -1
  22. package/dist/{release-report-BtpgWRI0.d.ts → release-report-DBB8lB1P.d.ts} +1 -1
  23. package/dist/reporting.d.ts +2 -2
  24. package/dist/{researcher-CoJMs2Iz.d.ts → researcher-CHMO56K0.d.ts} +1 -1
  25. package/dist/rl.d.ts +3 -3
  26. package/dist/rl.js +3 -1
  27. package/dist/rl.js.map +1 -1
  28. package/dist/{run-improvement-loop-Bfam3MT1.d.ts → run-improvement-loop-B-L8GgpW.d.ts} +1 -1
  29. package/dist/{sequential-DdV5ShjT.d.ts → sequential-CbFH___X.d.ts} +23 -1
  30. package/dist/{types-DHqkLwEU.d.ts → types-CqPax19X.d.ts} +1 -1
  31. package/dist/verdict-CeEgtjyI.d.ts +32 -0
  32. package/docs/adapters-observability.md +15 -0
  33. package/docs/design/phase-d-rfc.md +125 -0
  34. package/docs/design/substrate-gaps-2026-05-27.md +118 -0
  35. package/docs/hosted-ingest-spec.md +204 -0
  36. package/package.json +22 -31
  37. package/dist/chunk-HRKOCLQA.js.map +0 -1
@@ -1,5 +1,5 @@
1
- import { M as MatrixResult, a as MatrixAxis, A as AxisSummary, b as MatrixCell, C as CellResult, R as RunAgentMatrixOptions } from '../types-DHqkLwEU.js';
2
- export { DefaultVerdict } from '@tangle-network/agent-runtime/loops';
1
+ import { M as MatrixResult, a as MatrixAxis, A as AxisSummary, b as MatrixCell, C as CellResult, R as RunAgentMatrixOptions } from '../types-CqPax19X.js';
2
+ export { D as DefaultVerdict } from '../verdict-CeEgtjyI.js';
3
3
 
4
4
  /**
5
5
  * Per-axis aggregation of cell runs into `AxisSummary` rows.
@@ -1,6 +1,6 @@
1
1
  import { AgentProfile } from '@tangle-network/sandbox';
2
- import { M as MatrixResult } from '../types-DHqkLwEU.js';
3
- import '@tangle-network/agent-runtime/loops';
2
+ import { M as MatrixResult } from '../types-CqPax19X.js';
3
+ import '../verdict-CeEgtjyI.js';
4
4
 
5
5
  interface MultishotMessage {
6
6
  role: 'user' | 'assistant' | 'tool';
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.45.0",
5
+ "version": "0.48.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -1,7 +1,7 @@
1
1
  import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
2
2
  import { TCloud } from '@tangle-network/tcloud';
3
3
  import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-BlwAtYYf.js';
4
- import { w as GateDecision } from './sequential-DdV5ShjT.js';
4
+ import { w as GateDecision } from './sequential-CbFH___X.js';
5
5
  import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
6
6
 
7
7
  interface Scenario {
@@ -1,6 +1,6 @@
1
1
  export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-CJ08tGwq.js';
2
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-BtpgWRI0.js';
3
- export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, I as InterimReleaseConfidence, c as InterimReleaseConfidenceInput, P as PairedEvalueOptions, d as PairedEvalueSequence, e as PairedEvalueStep, f as ParetoFigureSpec, g as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, h as ResearchReport, i as ResearchReportCandidate, j as ResearchReportDecision, k as ResearchReportMethodology, l as ResearchReportOptions, m as ResearchReportRecommendation, S as SequentialDecision, n as SummaryTable, o as SummaryTableOptions, p as SummaryTableRow, q as evaluateInterimReleaseConfidence, r as gainHistogram, s as pairedEvalueSequence, t as paretoChart, u as researchReport, v as summaryTable } from './sequential-DdV5ShjT.js';
2
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DBB8lB1P.js';
3
+ export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, I as InterimReleaseConfidence, c as InterimReleaseConfidenceInput, P as PairedEvalueOptions, d as PairedEvalueSequence, e as PairedEvalueStep, f as ParetoFigureSpec, g as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, h as ResearchReport, i as ResearchReportCandidate, j as ResearchReportDecision, k as ResearchReportMethodology, l as ResearchReportOptions, m as ResearchReportRecommendation, S as SequentialDecision, n as SummaryTable, o as SummaryTableOptions, p as SummaryTableRow, q as evaluateInterimReleaseConfidence, r as gainHistogram, s as pairedEvalueSequence, t as paretoChart, u as researchReport, v as summaryTable } from './sequential-CbFH___X.js';
4
4
  import './run-record-BGY6bHRh.js';
5
5
  import './errors-mje_cKOs.js';
6
6
  import './outcome-store-BxJ3DQKJ.js';
@@ -1,6 +1,6 @@
1
1
  import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-BGY6bHRh.js';
2
2
  import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-BXVRUZyX.js';
3
- import { l as ResearchReportOptions, h as ResearchReport, w as GateDecision } from './sequential-DdV5ShjT.js';
3
+ import { l as ResearchReportOptions, h as ResearchReport, w as GateDecision } from './sequential-CbFH___X.js';
4
4
  import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
5
5
  import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CTDhR1Sg.js';
6
6
  import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
package/dist/rl.d.ts CHANGED
@@ -1,12 +1,12 @@
1
1
  import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
2
2
  import { d as CampaignResult } from './types-8u72Gc76.js';
3
- import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-CoJMs2Iz.js';
4
- export { r as runEvalCampaign } from './researcher-CoJMs2Iz.js';
3
+ import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-CHMO56K0.js';
4
+ export { r as runEvalCampaign } from './researcher-CHMO56K0.js';
5
5
  import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
6
6
  import { O as OutcomeStore } from './outcome-store-BxJ3DQKJ.js';
7
7
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, a as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-BxJ3DQKJ.js';
8
8
  import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-CJ08tGwq.js';
9
- import { I as InterimReleaseConfidence } from './sequential-DdV5ShjT.js';
9
+ import { I as InterimReleaseConfidence } from './sequential-CbFH___X.js';
10
10
  import './errors-mje_cKOs.js';
11
11
  import './llm-client-BXVRUZyX.js';
12
12
  import './raw-provider-sink-C46HDghv.js';
package/dist/rl.js CHANGED
@@ -1311,7 +1311,9 @@ var PredictiveValidityResearcher = class {
1311
1311
  searchScore: 0,
1312
1312
  holdoutScore: 0,
1313
1313
  overfitGap: 0,
1314
- baselineOverfitGap: 0
1314
+ baselineOverfitGap: 0,
1315
+ medianCandidateCost: Number.NaN,
1316
+ medianBaselineCost: Number.NaN
1315
1317
  },
1316
1318
  reason: "predictive-validity researcher does not execute plans; the caller is expected to run the sweep and call rubricPredictiveValidity directly with the resulting RunRecord[].",
1317
1319
  rejectionCode: "few_runs"