@tangle-network/agent-eval 0.47.0 → 0.49.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -0
- package/dist/adapters/otel.d.ts +103 -0
- package/dist/adapters/otel.js +110 -0
- package/dist/adapters/otel.js.map +1 -0
- package/dist/campaign/index.d.ts +2 -2
- package/dist/campaign/index.js +1 -1
- package/dist/{chunk-ZQABFCVJ.js → chunk-OYI6RZJK.js} +9 -14
- package/dist/chunk-OYI6RZJK.js.map +1 -0
- package/dist/{chunk-HRKOCLQA.js → chunk-XAP6DJZE.js} +1 -1
- package/dist/chunk-XAP6DJZE.js.map +1 -0
- package/dist/contract/index.d.ts +3 -3
- package/dist/contract/index.js +4 -4
- package/dist/contract/index.js.map +1 -1
- package/dist/hosted/index.js +1 -1
- package/dist/index.d.ts +6 -5
- package/dist/index.js +30 -3
- package/dist/index.js.map +1 -1
- package/dist/matrix/index.d.ts +2 -2
- package/dist/multishot/index.d.ts +2 -2
- package/dist/openapi.json +1 -1
- package/dist/{release-report-BtpgWRI0.d.ts → release-report-DBB8lB1P.d.ts} +1 -1
- package/dist/reporting.d.ts +2 -2
- package/dist/{researcher-CoJMs2Iz.d.ts → researcher-CHMO56K0.d.ts} +1 -1
- package/dist/rl.d.ts +3 -3
- package/dist/rl.js +3 -1
- package/dist/rl.js.map +1 -1
- package/dist/{run-improvement-loop-Bfam3MT1.d.ts → run-improvement-loop-B-L8GgpW.d.ts} +1 -1
- package/dist/{sequential-DdV5ShjT.d.ts → sequential-CbFH___X.d.ts} +23 -1
- package/dist/{types-DHqkLwEU.d.ts → types-CqPax19X.d.ts} +1 -1
- package/dist/verdict-CeEgtjyI.d.ts +32 -0
- package/docs/adapters-observability.md +15 -0
- package/docs/design/substrate-gaps.md +118 -0
- package/package.json +17 -31
- package/dist/chunk-HRKOCLQA.js.map +0 -1
- package/dist/chunk-ZQABFCVJ.js.map +0 -1
package/dist/matrix/index.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { M as MatrixResult, a as MatrixAxis, A as AxisSummary, b as MatrixCell, C as CellResult, R as RunAgentMatrixOptions } from '../types-DHqkLwEU.js';
|
|
2
|
-
export { DefaultVerdict } from '
|
|
1
|
+
import { M as MatrixResult, a as MatrixAxis, A as AxisSummary, b as MatrixCell, C as CellResult, R as RunAgentMatrixOptions } from '../types-CqPax19X.js';
|
|
2
|
+
export { D as DefaultVerdict } from '../verdict-CeEgtjyI.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* Per-axis aggregation of cell runs into `AxisSummary` rows.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { AgentProfile } from '@tangle-network/sandbox';
|
|
2
|
-
import { M as MatrixResult } from '../types-DHqkLwEU.js';
|
|
3
|
-
import '
|
|
2
|
+
import { M as MatrixResult } from '../types-CqPax19X.js';
|
|
3
|
+
import '../verdict-CeEgtjyI.js';
|
|
4
4
|
|
|
5
5
|
interface MultishotMessage {
|
|
6
6
|
role: 'user' | 'assistant' | 'tool';
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.
|
|
5
|
+
"version": "0.49.0",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
|
|
2
2
|
import { TCloud } from '@tangle-network/tcloud';
|
|
3
3
|
import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-BlwAtYYf.js';
|
|
4
|
-
import { w as GateDecision } from './sequential-DdV5ShjT.js';
|
|
4
|
+
import { w as GateDecision } from './sequential-CbFH___X.js';
|
|
5
5
|
import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
|
|
6
6
|
|
|
7
7
|
interface Scenario {
|
package/dist/reporting.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-CJ08tGwq.js';
|
|
2
|
-
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-BtpgWRI0.js';
|
|
3
|
-
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, I as InterimReleaseConfidence, c as InterimReleaseConfidenceInput, P as PairedEvalueOptions, d as PairedEvalueSequence, e as PairedEvalueStep, f as ParetoFigureSpec, g as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, h as ResearchReport, i as ResearchReportCandidate, j as ResearchReportDecision, k as ResearchReportMethodology, l as ResearchReportOptions, m as ResearchReportRecommendation, S as SequentialDecision, n as SummaryTable, o as SummaryTableOptions, p as SummaryTableRow, q as evaluateInterimReleaseConfidence, r as gainHistogram, s as pairedEvalueSequence, t as paretoChart, u as researchReport, v as summaryTable } from './sequential-DdV5ShjT.js';
|
|
2
|
+
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DBB8lB1P.js';
|
|
3
|
+
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, I as InterimReleaseConfidence, c as InterimReleaseConfidenceInput, P as PairedEvalueOptions, d as PairedEvalueSequence, e as PairedEvalueStep, f as ParetoFigureSpec, g as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, h as ResearchReport, i as ResearchReportCandidate, j as ResearchReportDecision, k as ResearchReportMethodology, l as ResearchReportOptions, m as ResearchReportRecommendation, S as SequentialDecision, n as SummaryTable, o as SummaryTableOptions, p as SummaryTableRow, q as evaluateInterimReleaseConfidence, r as gainHistogram, s as pairedEvalueSequence, t as paretoChart, u as researchReport, v as summaryTable } from './sequential-CbFH___X.js';
|
|
4
4
|
import './run-record-BGY6bHRh.js';
|
|
5
5
|
import './errors-mje_cKOs.js';
|
|
6
6
|
import './outcome-store-BxJ3DQKJ.js';
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-BGY6bHRh.js';
|
|
2
2
|
import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-BXVRUZyX.js';
|
|
3
|
-
import { l as ResearchReportOptions, h as ResearchReport, w as GateDecision } from './sequential-DdV5ShjT.js';
|
|
3
|
+
import { l as ResearchReportOptions, h as ResearchReport, w as GateDecision } from './sequential-CbFH___X.js';
|
|
4
4
|
import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
|
|
5
5
|
import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CTDhR1Sg.js';
|
|
6
6
|
import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
|
package/dist/rl.d.ts
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
|
|
2
2
|
import { d as CampaignResult } from './types-8u72Gc76.js';
|
|
3
|
-
import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-CoJMs2Iz.js';
|
|
4
|
-
export { r as runEvalCampaign } from './researcher-CoJMs2Iz.js';
|
|
3
|
+
import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-CHMO56K0.js';
|
|
4
|
+
export { r as runEvalCampaign } from './researcher-CHMO56K0.js';
|
|
5
5
|
import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
6
6
|
import { O as OutcomeStore } from './outcome-store-BxJ3DQKJ.js';
|
|
7
7
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, a as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-BxJ3DQKJ.js';
|
|
8
8
|
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-CJ08tGwq.js';
|
|
9
|
-
import { I as InterimReleaseConfidence } from './sequential-DdV5ShjT.js';
|
|
9
|
+
import { I as InterimReleaseConfidence } from './sequential-CbFH___X.js';
|
|
10
10
|
import './errors-mje_cKOs.js';
|
|
11
11
|
import './llm-client-BXVRUZyX.js';
|
|
12
12
|
import './raw-provider-sink-C46HDghv.js';
|
package/dist/rl.js
CHANGED
|
@@ -1311,7 +1311,9 @@ var PredictiveValidityResearcher = class {
|
|
|
1311
1311
|
searchScore: 0,
|
|
1312
1312
|
holdoutScore: 0,
|
|
1313
1313
|
overfitGap: 0,
|
|
1314
|
-
baselineOverfitGap: 0
|
|
1314
|
+
baselineOverfitGap: 0,
|
|
1315
|
+
medianCandidateCost: Number.NaN,
|
|
1316
|
+
medianBaselineCost: Number.NaN
|
|
1315
1317
|
},
|
|
1316
1318
|
reason: "predictive-validity researcher does not execute plans; the caller is expected to run the sweep and call rubricPredictiveValidity directly with the resulting RunRecord[].",
|
|
1317
1319
|
rejectionCode: "few_runs"
|