@tangle-network/agent-eval 0.60.0 → 0.61.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +21 -0
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +5 -5
- package/dist/agent-profile-9J9hxdm2.d.ts +114 -0
- package/dist/benchmarks/index.d.ts +3 -3
- package/dist/builder-eval/index.js +2 -2
- package/dist/campaign/index.d.ts +151 -11
- package/dist/campaign/index.js +211 -10
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-QDOSODID.js → chunk-3B7Y5AUR.js} +2 -2
- package/dist/{chunk-QYJT52YW.js → chunk-3BFEG2F6.js} +1 -1
- package/dist/chunk-3BFEG2F6.js.map +1 -0
- package/dist/{chunk-J4DIMSRK.js → chunk-6EKXFFGQ.js} +2 -2
- package/dist/{chunk-63EPZQUZ.js → chunk-6REHLN5J.js} +2 -2
- package/dist/{chunk-GM476SZU.js → chunk-AIWHLG7J.js} +5 -5
- package/dist/{chunk-AIXHUIHG.js → chunk-B26KI423.js} +3 -3
- package/dist/{chunk-NCK5QLGT.js → chunk-F3SRAAZO.js} +2 -2
- package/dist/{chunk-GBHRUAOF.js → chunk-GMXHLSLL.js} +2 -2
- package/dist/{chunk-VXNVVBZO.js → chunk-IHDHUN2X.js} +2 -2
- package/dist/{chunk-S3SDD56V.js → chunk-ITBRCT73.js} +2 -2
- package/dist/{chunk-OLIBRKRD.js → chunk-KX6F6NCG.js} +2 -2
- package/dist/{chunk-NOPYCRNG.js → chunk-OLULBECP.js} +13 -2
- package/dist/chunk-OLULBECP.js.map +1 -0
- package/dist/chunk-PQV2TKC3.js +27 -0
- package/dist/chunk-PQV2TKC3.js.map +1 -0
- package/dist/{chunk-UBPIXOC4.js → chunk-SBCB6VZY.js} +2 -2
- package/dist/{chunk-LBSXXH56.js → chunk-SUGME4OT.js} +5 -5
- package/dist/chunk-SUGME4OT.js.map +1 -0
- package/dist/{chunk-YTMXBHFM.js → chunk-T375SUOZ.js} +2 -2
- package/dist/{chunk-PIEAE33T.js → chunk-Z4ZCBC7M.js} +2 -2
- package/dist/cli.js +3 -3
- package/dist/contract/index.d.ts +13 -13
- package/dist/contract/index.js +7 -7
- package/dist/{control-DjEgwWNo.d.ts → control-Bf8owbuG.d.ts} +2 -2
- package/dist/control.d.ts +5 -5
- package/dist/control.js +3 -3
- package/dist/{dataset-BlwAtYYf.d.ts → dataset-B2kL-fSM.d.ts} +1 -1
- package/dist/{errors-mje_cKOs.d.ts → errors-Dwqw-T_m.d.ts} +1 -1
- package/dist/{feedback-trajectory-DpUmE90J.d.ts → feedback-trajectory-8hKC5EOb.d.ts} +1 -1
- package/dist/governance/index.d.ts +3 -3
- package/dist/hosted/index.d.ts +5 -5
- package/dist/{index-wlaiph9Y.d.ts → index-Bvk35ils.d.ts} +1 -1
- package/dist/{index-BIkvdkSU.d.ts → index-D9dwa00f.d.ts} +2 -2
- package/dist/index.d.ts +24 -132
- package/dist/index.js +16 -29
- package/dist/index.js.map +1 -1
- package/dist/{integrity-CfXjSqEv.d.ts → integrity-CJzrpUua.d.ts} +1 -1
- package/dist/{llm-client-BXVRUZyX.d.ts → llm-client-DbjLfz-K.d.ts} +1 -1
- package/dist/meta-eval/index.d.ts +3 -3
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +3 -3
- package/dist/{provenance-BM8vmMBa.d.ts → provenance-D0WeCXt1.d.ts} +5 -5
- package/dist/{red-team-CrC5MZYd.d.ts → red-team-DW9Ca_tj.d.ts} +1 -1
- package/dist/{registry-DK9kqXvb.d.ts → registry-qmbYT3Eo.d.ts} +2 -2
- package/dist/{release-report-DmPjIce3.d.ts → release-report-DszkgvJ3.d.ts} +3 -3
- package/dist/reporting.d.ts +6 -6
- package/dist/reporting.js +4 -4
- package/dist/{researcher-JP8EvnLv.d.ts → researcher-BaVsy0sW.d.ts} +4 -4
- package/dist/rl.d.ts +9 -9
- package/dist/rl.js +7 -7
- package/dist/{rubric-predictive-validity-B3qNa4aY.d.ts → rubric-predictive-validity-DgBHWsh7.d.ts} +1 -1
- package/dist/run-campaign-HXPJAUZ3.js +10 -0
- package/dist/{run-record-etiCMsUq.d.ts → run-record-DgUVo5pw.d.ts} +1 -1
- package/dist/{summary-report-DLxh4yWk.d.ts → summary-report-BQvXpvaR.d.ts} +1 -1
- package/dist/traces.d.ts +2 -2
- package/dist/traces.js +3 -3
- package/dist/{types-VCIXx_yo.d.ts → types-Beb6KPqZ.d.ts} +21 -1
- package/dist/wire/index.d.ts +3 -3
- package/dist/wire/index.js +3 -3
- package/package.json +12 -25
- package/dist/chunk-LBSXXH56.js.map +0 -1
- package/dist/chunk-NOPYCRNG.js.map +0 -1
- package/dist/chunk-QYJT52YW.js.map +0 -1
- package/dist/run-campaign-5XENUKRF.js +0 -10
- /package/dist/{chunk-QDOSODID.js.map → chunk-3B7Y5AUR.js.map} +0 -0
- /package/dist/{chunk-J4DIMSRK.js.map → chunk-6EKXFFGQ.js.map} +0 -0
- /package/dist/{chunk-63EPZQUZ.js.map → chunk-6REHLN5J.js.map} +0 -0
- /package/dist/{chunk-GM476SZU.js.map → chunk-AIWHLG7J.js.map} +0 -0
- /package/dist/{chunk-AIXHUIHG.js.map → chunk-B26KI423.js.map} +0 -0
- /package/dist/{chunk-NCK5QLGT.js.map → chunk-F3SRAAZO.js.map} +0 -0
- /package/dist/{chunk-GBHRUAOF.js.map → chunk-GMXHLSLL.js.map} +0 -0
- /package/dist/{chunk-VXNVVBZO.js.map → chunk-IHDHUN2X.js.map} +0 -0
- /package/dist/{chunk-S3SDD56V.js.map → chunk-ITBRCT73.js.map} +0 -0
- /package/dist/{chunk-OLIBRKRD.js.map → chunk-KX6F6NCG.js.map} +0 -0
- /package/dist/{chunk-UBPIXOC4.js.map → chunk-SBCB6VZY.js.map} +0 -0
- /package/dist/{chunk-YTMXBHFM.js.map → chunk-T375SUOZ.js.map} +0 -0
- /package/dist/{chunk-PIEAE33T.js.map → chunk-Z4ZCBC7M.js.map} +0 -0
- /package/dist/{run-campaign-5XENUKRF.js.map → run-campaign-HXPJAUZ3.js.map} +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-mje_cKOs.js';
|
|
1
|
+
import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-Dwqw-T_m.js';
|
|
2
2
|
import { R as RawProviderSink, P as ProviderRedactor } from './raw-provider-sink-C46HDghv.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
@@ -2,9 +2,9 @@ import { T as TraceStore } from '../store-CKUAgsJz.js';
|
|
|
2
2
|
import { R as Run } from '../schema-m0gsnbt3.js';
|
|
3
3
|
import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
4
4
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
5
|
-
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-B3qNa4aY.js';
|
|
6
|
-
import '../run-record-etiCMsUq.js';
|
|
7
|
-
import '../errors-mje_cKOs.js';
|
|
5
|
+
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-DgBHWsh7.js';
|
|
6
|
+
import '../run-record-DgUVo5pw.js';
|
|
7
|
+
import '../errors-Dwqw-T_m.js';
|
|
8
8
|
|
|
9
9
|
/**
|
|
10
10
|
* Correlation study — "does our eval score predict real-world outcomes?"
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.60.0",
|
|
5
|
+
"version": "0.61.0",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
package/dist/pipelines/index.js
CHANGED
|
@@ -3,13 +3,13 @@ import {
|
|
|
3
3
|
classifyFailure,
|
|
4
4
|
compareToBaseline,
|
|
5
5
|
computeToolUseMetrics
|
|
6
|
-
} from "../chunk-QDOSODID.js";
|
|
6
|
+
} from "../chunk-3B7Y5AUR.js";
|
|
7
7
|
import {
|
|
8
8
|
buildTrajectory
|
|
9
9
|
} from "../chunk-RZTMDUO7.js";
|
|
10
10
|
import {
|
|
11
11
|
interRaterReliability
|
|
12
|
-
} from "../chunk-S3SDD56V.js";
|
|
12
|
+
} from "../chunk-ITBRCT73.js";
|
|
13
13
|
import {
|
|
14
14
|
aggregateLlm,
|
|
15
15
|
argHash,
|
|
@@ -18,7 +18,7 @@ import {
|
|
|
18
18
|
toolSpans
|
|
19
19
|
} from "../chunk-47X6LRCE.js";
|
|
20
20
|
import "../chunk-5BKGXME7.js";
|
|
21
|
-
import "../chunk-QYJT52YW.js";
|
|
21
|
+
import "../chunk-3BFEG2F6.js";
|
|
22
22
|
import "../chunk-PZ5AY32C.js";
|
|
23
23
|
|
|
24
24
|
// src/pipelines/budget-breach.ts
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import { S as Scenario,
|
|
2
|
-
import { L as LlmClientOptions } from './llm-client-
|
|
3
|
-
import { R as RedTeamCase } from './red-team-
|
|
4
|
-
import { R as RunRecord } from './run-record-
|
|
5
|
-
import { H as HostedClient, T as TraceSpanEvent } from './index-
|
|
1
|
+
import { S as Scenario, C as CampaignResult, q as GateResult, v as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, n as CampaignTraceWriter, M as MutableSurface, s as GenerationRecord, p as GateDecision } from './types-Beb6KPqZ.js';
|
|
2
|
+
import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
|
|
3
|
+
import { R as RedTeamCase } from './red-team-DW9Ca_tj.js';
|
|
4
|
+
import { R as RunRecord } from './run-record-DgUVo5pw.js';
|
|
5
|
+
import { H as HostedClient, T as TraceSpanEvent } from './index-D9dwa00f.js';
|
|
6
6
|
|
|
7
7
|
/**
|
|
8
8
|
* @experimental
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-
|
|
2
|
-
import { R as RunRecord } from './run-record-
|
|
1
|
+
import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-DbjLfz-K.js';
|
|
2
|
+
import { R as RunRecord } from './run-record-DgUVo5pw.js';
|
|
3
3
|
import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
|
|
4
4
|
import { J as JudgeInput } from './types-DhqpAi_z.js';
|
|
5
5
|
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
|
|
2
2
|
import { a as JudgeScore } from './types-DhqpAi_z.js';
|
|
3
|
-
import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-
|
|
4
|
-
import { m as GateDecision } from './summary-report-
|
|
5
|
-
import { R as RunRecord, a as RunSplitTag } from './run-record-
|
|
3
|
+
import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-B2kL-fSM.js';
|
|
4
|
+
import { m as GateDecision } from './summary-report-BQvXpvaR.js';
|
|
5
|
+
import { R as RunRecord, a as RunSplitTag } from './run-record-DgUVo5pw.js';
|
|
6
6
|
|
|
7
7
|
/**
|
|
8
8
|
* Release confidence gate.
|
package/dist/reporting.d.ts
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-
|
|
2
|
-
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-
|
|
1
|
+
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-DgBHWsh7.js';
|
|
2
|
+
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DszkgvJ3.js';
|
|
3
3
|
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
|
|
4
|
-
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-
|
|
5
|
-
import './run-record-
|
|
6
|
-
import './errors-
|
|
4
|
+
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-BQvXpvaR.js';
|
|
5
|
+
import './run-record-DgUVo5pw.js';
|
|
6
|
+
import './errors-Dwqw-T_m.js';
|
|
7
7
|
import './schema-m0gsnbt3.js';
|
|
8
8
|
import './outcome-store-D6KWmYvj.js';
|
|
9
9
|
import './judge-calibration-DilmB3Ml.js';
|
|
10
10
|
import './types-DhqpAi_z.js';
|
|
11
11
|
import '@tangle-network/tcloud';
|
|
12
|
-
import './dataset-
|
|
12
|
+
import './dataset-B2kL-fSM.js';
|
|
13
13
|
import './failure-cluster-CL7IVgkJ.js';
|
|
14
14
|
import './store-CKUAgsJz.js';
|
package/dist/reporting.js
CHANGED
|
@@ -4,7 +4,7 @@ import {
|
|
|
4
4
|
evaluateReleaseConfidence,
|
|
5
5
|
judgeReplayGate,
|
|
6
6
|
renderReleaseReport
|
|
7
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-B26KI423.js";
|
|
8
8
|
import {
|
|
9
9
|
rubricPredictiveValidity
|
|
10
10
|
} from "./chunk-YRZ4M5GS.js";
|
|
@@ -18,14 +18,14 @@ import {
|
|
|
18
18
|
paretoChart,
|
|
19
19
|
researchReport,
|
|
20
20
|
summaryTable
|
|
21
|
-
} from "./chunk-
|
|
21
|
+
} from "./chunk-KX6F6NCG.js";
|
|
22
22
|
import {
|
|
23
23
|
benjaminiHochberg,
|
|
24
24
|
pairedBootstrap,
|
|
25
25
|
wilcoxonSignedRank
|
|
26
|
-
} from "./chunk-
|
|
26
|
+
} from "./chunk-ITBRCT73.js";
|
|
27
27
|
import "./chunk-VSMTAMNK.js";
|
|
28
|
-
import "./chunk-
|
|
28
|
+
import "./chunk-3BFEG2F6.js";
|
|
29
29
|
import "./chunk-PZ5AY32C.js";
|
|
30
30
|
export {
|
|
31
31
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-
|
|
2
|
-
import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-
|
|
3
|
-
import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-
|
|
1
|
+
import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-DgUVo5pw.js';
|
|
2
|
+
import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-DbjLfz-K.js';
|
|
3
|
+
import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-BQvXpvaR.js';
|
|
4
4
|
import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DEZwY14K.js';
|
|
5
|
-
import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-
|
|
5
|
+
import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CJzrpUua.js';
|
|
6
6
|
import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
|
|
7
7
|
import { F as FailureClass } from './schema-m0gsnbt3.js';
|
|
8
8
|
import { T as TraceStore } from './store-CKUAgsJz.js';
|
package/dist/rl.d.ts
CHANGED
|
@@ -1,20 +1,20 @@
|
|
|
1
|
-
import { R as RunRecord, a as RunSplitTag } from './run-record-
|
|
2
|
-
import {
|
|
3
|
-
import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-
|
|
4
|
-
export { r as runEvalCampaign } from './researcher-
|
|
1
|
+
import { R as RunRecord, a as RunSplitTag } from './run-record-DgUVo5pw.js';
|
|
2
|
+
import { C as CampaignResult } from './types-Beb6KPqZ.js';
|
|
3
|
+
import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-BaVsy0sW.js';
|
|
4
|
+
export { r as runEvalCampaign } from './researcher-BaVsy0sW.js';
|
|
5
5
|
import { S as Span } from './schema-m0gsnbt3.js';
|
|
6
6
|
import { T as TraceStore } from './store-CKUAgsJz.js';
|
|
7
7
|
import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
8
8
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
9
|
-
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-
|
|
9
|
+
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-DgBHWsh7.js';
|
|
10
10
|
import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
|
|
11
|
-
import './errors-
|
|
12
|
-
import './llm-client-
|
|
11
|
+
import './errors-Dwqw-T_m.js';
|
|
12
|
+
import './llm-client-DbjLfz-K.js';
|
|
13
13
|
import './raw-provider-sink-C46HDghv.js';
|
|
14
|
-
import './summary-report-
|
|
14
|
+
import './summary-report-BQvXpvaR.js';
|
|
15
15
|
import './failure-cluster-CL7IVgkJ.js';
|
|
16
16
|
import './emitter-DEZwY14K.js';
|
|
17
|
-
import './integrity-
|
|
17
|
+
import './integrity-CJzrpUua.js';
|
|
18
18
|
|
|
19
19
|
/**
|
|
20
20
|
* Test-time compute scaling curves.
|
package/dist/rl.js
CHANGED
|
@@ -10,27 +10,27 @@ import {
|
|
|
10
10
|
} from "./chunk-3RF76KTD.js";
|
|
11
11
|
import {
|
|
12
12
|
runEvalCampaign
|
|
13
|
-
} from "./chunk-
|
|
14
|
-
import "./chunk-
|
|
13
|
+
} from "./chunk-AIWHLG7J.js";
|
|
14
|
+
import "./chunk-F3SRAAZO.js";
|
|
15
15
|
import {
|
|
16
16
|
rubricPredictiveValidity
|
|
17
17
|
} from "./chunk-YRZ4M5GS.js";
|
|
18
18
|
import {
|
|
19
19
|
evaluateInterimReleaseConfidence
|
|
20
20
|
} from "./chunk-MAZ26DC7.js";
|
|
21
|
-
import "./chunk-
|
|
21
|
+
import "./chunk-KX6F6NCG.js";
|
|
22
22
|
import {
|
|
23
23
|
benjaminiHochberg,
|
|
24
24
|
wilcoxonSignedRank
|
|
25
|
-
} from "./chunk-
|
|
26
|
-
import "./chunk-
|
|
25
|
+
} from "./chunk-ITBRCT73.js";
|
|
26
|
+
import "./chunk-SBCB6VZY.js";
|
|
27
27
|
import "./chunk-TVVP3ZZQ.js";
|
|
28
28
|
import "./chunk-VSMTAMNK.js";
|
|
29
|
-
import "./chunk-
|
|
29
|
+
import "./chunk-IHDHUN2X.js";
|
|
30
30
|
import "./chunk-PC4UYEBM.js";
|
|
31
31
|
import {
|
|
32
32
|
ValidationError
|
|
33
|
-
} from "./chunk-
|
|
33
|
+
} from "./chunk-3BFEG2F6.js";
|
|
34
34
|
import "./chunk-PZ5AY32C.js";
|
|
35
35
|
|
|
36
36
|
// src/rl/compute-curves.ts
|
package/dist/traces.d.ts
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { N as NotFoundError, R as ReplayError } from './errors-
|
|
1
|
+
import { N as NotFoundError, R as ReplayError } from './errors-Dwqw-T_m.js';
|
|
2
2
|
import { P as ProviderRedactor, R as RawProviderSink, d as RawProviderEvent } from './raw-provider-sink-C46HDghv.js';
|
|
3
3
|
export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, c as RawProviderDirection, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';
|
|
4
4
|
import { R as RunCompleteHook, a as RunCompleteHookContext } from './emitter-DEZwY14K.js';
|
|
5
5
|
export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DEZwY14K.js';
|
|
6
|
-
export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-
|
|
6
|
+
export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CJzrpUua.js';
|
|
7
7
|
import { T as TraceStore } from './store-CKUAgsJz.js';
|
|
8
8
|
export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreOptions, I as InMemoryTraceStore, R as RunFilter, S as SpanFilter } from './store-CKUAgsJz.js';
|
|
9
9
|
export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-CqTxMwDw.js';
|
package/dist/traces.js
CHANGED
|
@@ -34,7 +34,7 @@ import {
|
|
|
34
34
|
tokenizeDomainWords,
|
|
35
35
|
traceAnalystFunctionGroup,
|
|
36
36
|
traceAnalystOnRunComplete
|
|
37
|
-
} from "./chunk-
|
|
37
|
+
} from "./chunk-Z4ZCBC7M.js";
|
|
38
38
|
import {
|
|
39
39
|
DEFAULT_REDACTION_RULES,
|
|
40
40
|
REDACTION_VERSION,
|
|
@@ -64,7 +64,7 @@ import {
|
|
|
64
64
|
RunIntegrityError,
|
|
65
65
|
assertRunCaptured,
|
|
66
66
|
throwIfRunIncomplete
|
|
67
|
-
} from "./chunk-
|
|
67
|
+
} from "./chunk-SBCB6VZY.js";
|
|
68
68
|
import {
|
|
69
69
|
TraceEmitter,
|
|
70
70
|
llmSpanFromProvider
|
|
@@ -77,7 +77,7 @@ import {
|
|
|
77
77
|
defaultProviderRedactor,
|
|
78
78
|
providerFromBaseUrl
|
|
79
79
|
} from "./chunk-PC4UYEBM.js";
|
|
80
|
-
import "./chunk-
|
|
80
|
+
import "./chunk-3BFEG2F6.js";
|
|
81
81
|
import "./chunk-PZ5AY32C.js";
|
|
82
82
|
export {
|
|
83
83
|
DEFAULT_REDACTION_RULES,
|
|
@@ -257,12 +257,28 @@ interface CampaignArtifactWriter {
|
|
|
257
257
|
write(path: string, content: string | Uint8Array): Promise<string>;
|
|
258
258
|
writeJson(path: string, value: unknown): Promise<string>;
|
|
259
259
|
}
|
|
260
|
+
/** Token usage accumulated for a cell. Structurally mirrors `RunTokenUsage`
|
|
261
|
+
* (run-record.ts) so a cell maps cleanly onto a `RunRecord` for the
|
|
262
|
+
* backend-integrity guard without coupling the campaign module to it. */
|
|
263
|
+
interface CampaignTokenUsage {
|
|
264
|
+
input: number;
|
|
265
|
+
output: number;
|
|
266
|
+
cached?: number;
|
|
267
|
+
}
|
|
260
268
|
/** @experimental Cell-scoped cost meter. Substrate auto-tracks LLM costs
|
|
261
269
|
* via the cost-ledger backend hooks; consumers can record additional
|
|
262
270
|
* spend (sandbox time, tool costs) via `observe`. */
|
|
263
271
|
interface CampaignCostMeter {
|
|
264
272
|
observe(amountUsd: number, source: string): void;
|
|
273
|
+
/** Record LLM token usage for this cell; accumulates across calls. A cell
|
|
274
|
+
* has `costUsd` but no token counts unless the dispatch reports them here —
|
|
275
|
+
* and the backend-integrity guard (`assertRealBackend`) keys on
|
|
276
|
+
* `tokenUsage`, so a cell that never reports tokens reads as a stub. Any
|
|
277
|
+
* dispatch that calls an LLM MUST report its usage. */
|
|
278
|
+
observeTokens(usage: CampaignTokenUsage): void;
|
|
265
279
|
current(): number;
|
|
280
|
+
/** Accumulated token usage for this cell (zeros if never observed). */
|
|
281
|
+
tokens(): CampaignTokenUsage;
|
|
266
282
|
}
|
|
267
283
|
/** @experimental Source tag — required on every store write. Used by the
|
|
268
284
|
* default training-source filter (production-trace samples NOT used as
|
|
@@ -352,6 +368,10 @@ interface CampaignCellResult<TArtifact> {
|
|
|
352
368
|
artifact: TArtifact;
|
|
353
369
|
judgeScores: Record<string, JudgeScore>;
|
|
354
370
|
costUsd: number;
|
|
371
|
+
/** LLM token usage the dispatch reported via `ctx.cost.observeTokens`.
|
|
372
|
+
* `{ input: 0, output: 0 }` when the dispatch reported none — which the
|
|
373
|
+
* backend-integrity guard reads as a stub. */
|
|
374
|
+
tokenUsage: CampaignTokenUsage;
|
|
355
375
|
durationMs: number;
|
|
356
376
|
seed: number;
|
|
357
377
|
cached: boolean;
|
|
@@ -430,4 +450,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
|
|
|
430
450
|
scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
|
|
431
451
|
}
|
|
432
452
|
|
|
433
|
-
export { type
|
|
453
|
+
export { labelTrustRank as A, type CampaignResult as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ProposeContext as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type LabeledScenarioWrite as c, type LabeledScenarioSampleArgs as d, type LabeledScenarioRecord as e, type LabelTrust as f, type LabeledScenarioSource as g, type CodeSurface as h, type CampaignAggregates as i, type CampaignArtifactWriter as j, type CampaignCellResult as k, type CampaignCostMeter as l, type CampaignTokenUsage as m, type CampaignTraceWriter as n, type GateContext as o, type GateDecision as p, type GateResult as q, type GenerationCandidate as r, type GenerationRecord as s, type JudgeAggregate as t, type JudgeDimension as u, type Mutator as v, type ProposedCandidate as w, type ScenarioAggregate as x, type SessionScript as y, isProposedCandidate as z };
|
package/dist/wire/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { F as FeedbackTrajectoryStore } from '../feedback-trajectory-
|
|
1
|
+
import { F as FeedbackTrajectoryStore } from '../feedback-trajectory-8hKC5EOb.js';
|
|
2
2
|
import { T as TraceStore } from '../store-CKUAgsJz.js';
|
|
3
3
|
import { z } from 'zod';
|
|
4
4
|
import { OpenAPIObject } from 'openapi3-ts/oas31';
|
|
@@ -8,8 +8,8 @@ import { Hono } from 'hono';
|
|
|
8
8
|
import '../control-runtime-DuFBYg7A.js';
|
|
9
9
|
import '../emitter-DEZwY14K.js';
|
|
10
10
|
import '../schema-m0gsnbt3.js';
|
|
11
|
-
import '../dataset-
|
|
12
|
-
import '../errors-
|
|
11
|
+
import '../dataset-B2kL-fSM.js';
|
|
12
|
+
import '../errors-Dwqw-T_m.js';
|
|
13
13
|
|
|
14
14
|
declare const RubricDimensionSchema: z.ZodObject<{
|
|
15
15
|
id: z.ZodString;
|
package/dist/wire/index.js
CHANGED
|
@@ -34,10 +34,10 @@ import {
|
|
|
34
34
|
runRpcOnce,
|
|
35
35
|
startServer,
|
|
36
36
|
startServerAsync
|
|
37
|
-
} from "../chunk-
|
|
38
|
-
import "../chunk-
|
|
37
|
+
} from "../chunk-6REHLN5J.js";
|
|
38
|
+
import "../chunk-IHDHUN2X.js";
|
|
39
39
|
import "../chunk-PC4UYEBM.js";
|
|
40
|
-
import "../chunk-
|
|
40
|
+
import "../chunk-3BFEG2F6.js";
|
|
41
41
|
import "../chunk-PZ5AY32C.js";
|
|
42
42
|
export {
|
|
43
43
|
BUILTIN_RUBRICS,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.60.0",
|
|
3
|
+
"version": "0.61.0",
|
|
4
4
|
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -144,18 +144,6 @@
|
|
|
144
144
|
"publishConfig": {
|
|
145
145
|
"access": "public"
|
|
146
146
|
},
|
|
147
|
-
"scripts": {
|
|
148
|
-
"build": "tsup && pnpm openapi",
|
|
149
|
-
"dev": "tsup --watch",
|
|
150
|
-
"prepare": "husky",
|
|
151
|
-
"prepublishOnly": "pnpm build",
|
|
152
|
-
"test": "vitest run",
|
|
153
|
-
"test:watch": "vitest",
|
|
154
|
-
"typecheck": "tsc --noEmit",
|
|
155
|
-
"lint": "biome check src",
|
|
156
|
-
"format": "biome format --write src",
|
|
157
|
-
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
158
|
-
},
|
|
159
147
|
"dependencies": {
|
|
160
148
|
"@asteasolutions/zod-to-openapi": "^8.5.0",
|
|
161
149
|
"@ax-llm/ax": "^19.0.25",
|
|
@@ -183,16 +171,6 @@
|
|
|
183
171
|
"typescript": "^5.7.0",
|
|
184
172
|
"vitest": "^3.0.0"
|
|
185
173
|
},
|
|
186
|
-
"pnpm": {
|
|
187
|
-
"minimumReleaseAge": 4320,
|
|
188
|
-
"minimumReleaseAgeExclude": [
|
|
189
|
-
"@tangle-network/sandbox"
|
|
190
|
-
],
|
|
191
|
-
"overrides": {
|
|
192
|
-
"postcss@<8.5.10": "^8.5.10",
|
|
193
|
-
"ws@>=8.0.0 <8.20.1": "^8.20.1"
|
|
194
|
-
}
|
|
195
|
-
},
|
|
196
174
|
"engines": {
|
|
197
175
|
"node": ">=20"
|
|
198
176
|
},
|
|
@@ -205,5 +183,14 @@
|
|
|
205
183
|
]
|
|
206
184
|
},
|
|
207
185
|
"license": "MIT",
|
|
208
|
-
"
|
|
209
|
-
|
|
186
|
+
"scripts": {
|
|
187
|
+
"build": "tsup && pnpm openapi",
|
|
188
|
+
"dev": "tsup --watch",
|
|
189
|
+
"test": "vitest run",
|
|
190
|
+
"test:watch": "vitest",
|
|
191
|
+
"typecheck": "tsc --noEmit",
|
|
192
|
+
"lint": "biome check src",
|
|
193
|
+
"format": "biome format --write src",
|
|
194
|
+
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
195
|
+
}
|
|
196
|
+
}
|