npm - @tangle-network/agent-eval - Versions diffs - 0.46.0 → 0.48.0 - Mend

@tangle-network/agent-eval 0.46.0 → 0.48.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

package/dist/adapters/traceai.d.ts +109 -0
package/dist/adapters/traceai.js +106 -0
package/dist/adapters/traceai.js.map +1 -0
package/dist/campaign/index.d.ts +2 -2
package/dist/campaign/index.js +1 -1
package/dist/chunk-OYI6RZJK.js +80 -0
package/dist/chunk-OYI6RZJK.js.map +1 -0
package/dist/{chunk-HRKOCLQA.js → chunk-XAP6DJZE.js} +1 -1
package/dist/chunk-XAP6DJZE.js.map +1 -0
package/dist/contract/index.d.ts +21 -3
package/dist/contract/index.js +83 -3
package/dist/contract/index.js.map +1 -1
package/dist/hosted/index.d.ts +192 -0
package/dist/hosted/index.js +10 -0
package/dist/hosted/index.js.map +1 -0
package/dist/index.d.ts +6 -5
package/dist/index.js +30 -3
package/dist/index.js.map +1 -1
package/dist/matrix/index.d.ts +2 -2
package/dist/multishot/index.d.ts +2 -2
package/dist/openapi.json +1 -1
package/dist/{release-report-BtpgWRI0.d.ts → release-report-DBB8lB1P.d.ts} +1 -1
package/dist/reporting.d.ts +2 -2
package/dist/{researcher-CoJMs2Iz.d.ts → researcher-CHMO56K0.d.ts} +1 -1
package/dist/rl.d.ts +3 -3
package/dist/rl.js +3 -1
package/dist/rl.js.map +1 -1
package/dist/{run-improvement-loop-Bfam3MT1.d.ts → run-improvement-loop-B-L8GgpW.d.ts} +1 -1
package/dist/{sequential-DdV5ShjT.d.ts → sequential-CbFH___X.d.ts} +23 -1
package/dist/{types-DHqkLwEU.d.ts → types-CqPax19X.d.ts} +1 -1
package/dist/verdict-CeEgtjyI.d.ts +32 -0
package/docs/adapters-observability.md +15 -0
package/docs/design/phase-d-rfc.md +125 -0
package/docs/design/substrate-gaps-2026-05-27.md +118 -0
package/docs/hosted-ingest-spec.md +204 -0
package/package.json +22 -31
package/dist/chunk-HRKOCLQA.js.map +0 -1

package/dist/matrix/index.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
-import { M as MatrixResult, a as MatrixAxis, A as AxisSummary, b as MatrixCell, C as CellResult, R as RunAgentMatrixOptions } from '../types-DHqkLwEU.js';
-export { DefaultVerdict } from '@tangle-network/agent-runtime/loops';
+import { M as MatrixResult, a as MatrixAxis, A as AxisSummary, b as MatrixCell, C as CellResult, R as RunAgentMatrixOptions } from '../types-CqPax19X.js';
+export { D as DefaultVerdict } from '../verdict-CeEgtjyI.js';
 /**
  * Per-axis aggregation of cell runs into `AxisSummary` rows.

package/dist/multishot/index.d.ts CHANGED Viewed

@@ -1,6 +1,6 @@
 import { AgentProfile } from '@tangle-network/sandbox';
-import { M as MatrixResult } from '../types-DHqkLwEU.js';
-import '@tangle-network/agent-runtime/loops';
+import { M as MatrixResult } from '../types-CqPax19X.js';
+import '../verdict-CeEgtjyI.js';
 interface MultishotMessage {
     role: 'user' | 'assistant' | 'tool';

package/dist/openapi.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "openapi": "3.1.0",
   "info": {
     "title": "@tangle-network/agent-eval — wire protocol",
-    "version": "0.45.0",
+    "version": "0.48.0",
     "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
     "contact": {
       "name": "Tangle Network",

package/dist/{release-report-BtpgWRI0.d.ts → release-report-DBB8lB1P.d.ts} RENAMED Viewed

@@ -1,7 +1,7 @@
 import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
 import { TCloud } from '@tangle-network/tcloud';
 import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-BlwAtYYf.js';
-import { w as GateDecision } from './sequential-DdV5ShjT.js';
+import { w as GateDecision } from './sequential-CbFH___X.js';
 import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
 interface Scenario {

package/dist/reporting.d.ts CHANGED Viewed

@@ -1,6 +1,6 @@
 export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-CJ08tGwq.js';
-export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-BtpgWRI0.js';
-export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, I as InterimReleaseConfidence, c as InterimReleaseConfidenceInput, P as PairedEvalueOptions, d as PairedEvalueSequence, e as PairedEvalueStep, f as ParetoFigureSpec, g as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, h as ResearchReport, i as ResearchReportCandidate, j as ResearchReportDecision, k as ResearchReportMethodology, l as ResearchReportOptions, m as ResearchReportRecommendation, S as SequentialDecision, n as SummaryTable, o as SummaryTableOptions, p as SummaryTableRow, q as evaluateInterimReleaseConfidence, r as gainHistogram, s as pairedEvalueSequence, t as paretoChart, u as researchReport, v as summaryTable } from './sequential-DdV5ShjT.js';
+export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DBB8lB1P.js';
+export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, I as InterimReleaseConfidence, c as InterimReleaseConfidenceInput, P as PairedEvalueOptions, d as PairedEvalueSequence, e as PairedEvalueStep, f as ParetoFigureSpec, g as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, h as ResearchReport, i as ResearchReportCandidate, j as ResearchReportDecision, k as ResearchReportMethodology, l as ResearchReportOptions, m as ResearchReportRecommendation, S as SequentialDecision, n as SummaryTable, o as SummaryTableOptions, p as SummaryTableRow, q as evaluateInterimReleaseConfidence, r as gainHistogram, s as pairedEvalueSequence, t as paretoChart, u as researchReport, v as summaryTable } from './sequential-CbFH___X.js';
 import './run-record-BGY6bHRh.js';
 import './errors-mje_cKOs.js';
 import './outcome-store-BxJ3DQKJ.js';

package/dist/{researcher-CoJMs2Iz.d.ts → researcher-CHMO56K0.d.ts} RENAMED Viewed

@@ -1,6 +1,6 @@
 import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-BGY6bHRh.js';
 import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-BXVRUZyX.js';
-import { l as ResearchReportOptions, h as ResearchReport, w as GateDecision } from './sequential-DdV5ShjT.js';
+import { l as ResearchReportOptions, h as ResearchReport, w as GateDecision } from './sequential-CbFH___X.js';
 import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
 import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CTDhR1Sg.js';
 import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';

package/dist/rl.d.ts CHANGED Viewed

@@ -1,12 +1,12 @@
 import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
 import { d as CampaignResult } from './types-8u72Gc76.js';
-import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-CoJMs2Iz.js';
-export { r as runEvalCampaign } from './researcher-CoJMs2Iz.js';
+import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-CHMO56K0.js';
+export { r as runEvalCampaign } from './researcher-CHMO56K0.js';
 import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
 import { O as OutcomeStore } from './outcome-store-BxJ3DQKJ.js';
 export { D as DeploymentOutcome, F as FileSystemOutcomeStore, a as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-BxJ3DQKJ.js';
 import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-CJ08tGwq.js';
-import { I as InterimReleaseConfidence } from './sequential-DdV5ShjT.js';
+import { I as InterimReleaseConfidence } from './sequential-CbFH___X.js';
 import './errors-mje_cKOs.js';
 import './llm-client-BXVRUZyX.js';
 import './raw-provider-sink-C46HDghv.js';

package/dist/rl.js CHANGED Viewed

@@ -1311,7 +1311,9 @@ var PredictiveValidityResearcher = class {
         searchScore: 0,
         holdoutScore: 0,
         overfitGap: 0,
-        baselineOverfitGap: 0
+        baselineOverfitGap: 0,
+        medianCandidateCost: Number.NaN,
+        medianBaselineCost: Number.NaN
       },
       reason: "predictive-validity researcher does not execute plans; the caller is expected to run the sweep and call rubricPredictiveValidity directly with the resulting RunRecord[].",
       rejectionCode: "few_runs"