@tangle-network/agent-eval 0.21.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +236 -1
- package/README.md +17 -3
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
- package/dist/chunk-4W4NCYM2.js.map +1 -0
- package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
- package/dist/chunk-6M774GY6.js +53 -0
- package/dist/chunk-6M774GY6.js.map +1 -0
- package/dist/chunk-7EAUOUQS.js +495 -0
- package/dist/chunk-7EAUOUQS.js.map +1 -0
- package/dist/chunk-AXHNWLIX.js +246 -0
- package/dist/chunk-AXHNWLIX.js.map +1 -0
- package/dist/chunk-EXGR4XEM.js +283 -0
- package/dist/chunk-EXGR4XEM.js.map +1 -0
- package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
- package/dist/chunk-IOXMGMHQ.js.map +1 -0
- package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
- package/dist/chunk-LZKIOBG2.js +2026 -0
- package/dist/chunk-LZKIOBG2.js.map +1 -0
- package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
- package/dist/chunk-QBW3YBTR.js.map +1 -0
- package/dist/chunk-QUKKGHTZ.js +121 -0
- package/dist/chunk-QUKKGHTZ.js.map +1 -0
- package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
- package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
- package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
- package/dist/{chunk-HRZELXCR.js → chunk-VQQSPGSM.js} +3 -3
- package/dist/cli.js +3 -3
- package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/eval-campaign-Ds5QljIh.d.ts +573 -0
- package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
- package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
- package/dist/index-ekBXweiQ.d.ts +1894 -0
- package/dist/index.d.ts +20 -430
- package/dist/index.js +154 -34
- package/dist/index.js.map +1 -1
- package/dist/integrity-Cr5YodSY.d.ts +210 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +7 -145
- package/dist/optimization.js +12 -3
- package/dist/reporting.d.ts +294 -4
- package/dist/reporting.js +18 -9
- package/dist/rl.d.ts +8 -0
- package/dist/rl.js +113 -0
- package/dist/rl.js.map +1 -0
- package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
- package/dist/sequential-DgU2mFsE.d.ts +304 -0
- package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-Ce1r4EYo.d.ts} +382 -2
- package/dist/traces.d.ts +101 -181
- package/dist/traces.js +19 -8
- package/dist/wire/index.js +3 -3
- package/docs/auto-research-loop-end-to-end.md +186 -0
- package/docs/research-report-methodology.md +19 -4
- package/docs/three-package-architecture.md +180 -0
- package/docs/wire-protocol.md +1 -1
- package/package.json +7 -2
- package/dist/chunk-3IX6QTB7.js.map +0 -1
- package/dist/chunk-KRR4VMH7.js +0 -423
- package/dist/chunk-KRR4VMH7.js.map +0 -1
- package/dist/chunk-WOK2RTWG.js.map +0 -1
- package/dist/chunk-YUFXO3TU.js.map +0 -1
- package/dist/reporting-Da2ihlcM.d.ts +0 -672
- /package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
- /package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
- /package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0
- /package/dist/{chunk-HRZELXCR.js.map → chunk-VQQSPGSM.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
stopOnNoProgress,
|
|
20
20
|
stopOnRepeatedAction,
|
|
21
21
|
subjectiveEval
|
|
22
|
-
} from "./chunk-ARZ6BEV6.js";
|
|
22
|
+
} from "./chunk-V5QSWN7L.js";
|
|
23
23
|
import {
|
|
24
24
|
CallbackResearcher,
|
|
25
25
|
DEFAULT_MUTATION_PRIMITIVES,
|
|
@@ -53,49 +53,111 @@ import {
|
|
|
53
53
|
summarizePreferenceMemory,
|
|
54
54
|
trialTraceFromMultiShotTrial,
|
|
55
55
|
withAssignedFeedbackSplit
|
|
56
|
-
} from "./chunk-HRZELXCR.js";
|
|
56
|
+
} from "./chunk-VQQSPGSM.js";
|
|
57
57
|
import {
|
|
58
58
|
RunRecordValidationError,
|
|
59
59
|
isRunRecord,
|
|
60
60
|
parseRunRecordSafe,
|
|
61
61
|
roundTripRunRecord,
|
|
62
62
|
validateRunRecord
|
|
63
|
-
} from "./chunk-YUFXO3TU.js";
|
|
63
|
+
} from "./chunk-QBW3YBTR.js";
|
|
64
64
|
import {
|
|
65
|
-
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
66
65
|
assertReleaseConfidence,
|
|
67
66
|
bootstrapCi,
|
|
68
|
-
canonicalize,
|
|
69
|
-
evaluateHypothesis,
|
|
70
67
|
evaluateReleaseConfidence,
|
|
71
|
-
gainHistogram,
|
|
72
|
-
hashJson,
|
|
73
68
|
judgeReplayGate,
|
|
74
|
-
paretoChart,
|
|
75
69
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
76
|
-
renderReleaseReport
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
70
|
+
renderReleaseReport
|
|
71
|
+
} from "./chunk-7EAUOUQS.js";
|
|
72
|
+
import {
|
|
73
|
+
PredictiveValidityResearcher,
|
|
74
|
+
adversarialScenarioSearch,
|
|
75
|
+
analyzeOptimizationResult,
|
|
76
|
+
applyEloUpdate,
|
|
77
|
+
bestOfN,
|
|
78
|
+
buildPairwiseFromCampaign,
|
|
79
|
+
compareAdaptationCurves,
|
|
80
|
+
detectRewardHacking,
|
|
81
|
+
doublyRobust,
|
|
82
|
+
extractPreferences,
|
|
83
|
+
extractStepRewards,
|
|
84
|
+
extractVerifiableReward,
|
|
85
|
+
extractVerifiableRewardsFromRecords,
|
|
86
|
+
filterDeterministicallyRewarded,
|
|
87
|
+
firstPassK,
|
|
88
|
+
fitBradleyTerry,
|
|
89
|
+
injectIrrelevantClause,
|
|
90
|
+
inverseProbabilityWeighting,
|
|
91
|
+
observationsFromRunRecords,
|
|
92
|
+
offPolicyEstimateAll,
|
|
93
|
+
prmTrainingPairs,
|
|
94
|
+
renameVariables,
|
|
95
|
+
runAdaptationCurve,
|
|
96
|
+
runComputeCurve,
|
|
97
|
+
runContaminationProbe,
|
|
98
|
+
runRLCampaign,
|
|
99
|
+
runwiseStepRewardSummary,
|
|
100
|
+
selfConsistency,
|
|
101
|
+
selfNormalizedImportanceWeighting,
|
|
102
|
+
shuffleOrder,
|
|
103
|
+
stepRewardsToJsonl,
|
|
104
|
+
thompsonCurriculum,
|
|
105
|
+
toAnthropicFormat,
|
|
106
|
+
toDpoJsonl,
|
|
107
|
+
toDpoRows,
|
|
108
|
+
toGrpoJsonl,
|
|
109
|
+
toGrpoRows,
|
|
110
|
+
toPrmJsonl,
|
|
111
|
+
toPrmRows,
|
|
112
|
+
toSftJsonl,
|
|
113
|
+
toSftRows,
|
|
114
|
+
toTRLFormat,
|
|
115
|
+
trialToRunRecord,
|
|
116
|
+
trialsToRunRecords,
|
|
117
|
+
varianceBasedCurriculum,
|
|
118
|
+
variantAggregateToRunRecord,
|
|
119
|
+
verificationReportToRunRecord
|
|
120
|
+
} from "./chunk-LZKIOBG2.js";
|
|
121
|
+
import {
|
|
122
|
+
runEvalCampaign
|
|
123
|
+
} from "./chunk-EXGR4XEM.js";
|
|
124
|
+
import {
|
|
125
|
+
LlmCallError,
|
|
126
|
+
LlmClient,
|
|
127
|
+
LlmRouteAssertionError,
|
|
128
|
+
assertLlmRoute,
|
|
129
|
+
callLlm,
|
|
130
|
+
callLlmJson,
|
|
131
|
+
probeLlm,
|
|
132
|
+
stripFencedJson
|
|
133
|
+
} from "./chunk-KAO3Q65R.js";
|
|
134
|
+
import {
|
|
135
|
+
evaluateInterimReleaseConfidence,
|
|
136
|
+
pairedEvalueSequence,
|
|
137
|
+
rubricPredictiveValidity
|
|
138
|
+
} from "./chunk-AXHNWLIX.js";
|
|
82
139
|
import {
|
|
140
|
+
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
83
141
|
benjaminiHochberg,
|
|
84
142
|
bhAdjust,
|
|
85
143
|
bonferroni,
|
|
86
144
|
cohensD,
|
|
87
145
|
confidenceInterval,
|
|
146
|
+
gainHistogram,
|
|
88
147
|
interRaterReliability,
|
|
89
148
|
mannWhitneyU,
|
|
90
149
|
normalizeScores,
|
|
91
150
|
pairedBootstrap,
|
|
92
151
|
pairedTTest,
|
|
93
152
|
pairedWilcoxon,
|
|
153
|
+
paretoChart,
|
|
94
154
|
partialCredit,
|
|
95
155
|
requiredSampleSize,
|
|
156
|
+
researchReport,
|
|
157
|
+
summaryTable,
|
|
96
158
|
weightedMean,
|
|
97
159
|
wilcoxonSignedRank
|
|
98
|
-
} from "./chunk-3IX6QTB7.js";
|
|
160
|
+
} from "./chunk-IOXMGMHQ.js";
|
|
99
161
|
import {
|
|
100
162
|
DEFAULT_REDACTION_RULES,
|
|
101
163
|
DEFAULT_TRACE_ANALYST_BUDGETS,
|
|
@@ -105,7 +167,8 @@ import {
|
|
|
105
167
|
OTEL_AGENT_EVAL_SCOPE,
|
|
106
168
|
OtlpFileTraceStore,
|
|
107
169
|
REDACTION_VERSION,
|
|
108
|
-
|
|
170
|
+
ReplayCache,
|
|
171
|
+
ReplayCacheMissError,
|
|
109
172
|
SpanNotFoundError,
|
|
110
173
|
TRACE_ANALYST_ACTOR_DESCRIPTION,
|
|
111
174
|
TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
|
|
@@ -117,10 +180,10 @@ import {
|
|
|
117
180
|
aggregateLlm,
|
|
118
181
|
analyzeTraces,
|
|
119
182
|
argHash,
|
|
120
|
-
assertRunCaptured,
|
|
121
183
|
buildTraceAnalystTools,
|
|
122
184
|
buildTraceInsightContext,
|
|
123
185
|
buildTraceInsightPrompt,
|
|
186
|
+
createReplayFetch,
|
|
124
187
|
defaultTraceInsightPanel,
|
|
125
188
|
describeTraceInsightScope,
|
|
126
189
|
domainEvidencePattern,
|
|
@@ -132,6 +195,7 @@ import {
|
|
|
132
195
|
isRetrievalSpan,
|
|
133
196
|
isSandboxSpan,
|
|
134
197
|
isToolSpan,
|
|
198
|
+
iterateRawCalls,
|
|
135
199
|
judgeSpans,
|
|
136
200
|
llmSpans,
|
|
137
201
|
planTraceInsightQuestions,
|
|
@@ -140,33 +204,34 @@ import {
|
|
|
140
204
|
runFailureClass,
|
|
141
205
|
runsForScenario,
|
|
142
206
|
scoreTraceInsightReadiness,
|
|
143
|
-
throwIfRunIncomplete,
|
|
144
207
|
tokenizeDomainWords,
|
|
145
208
|
toolSpans,
|
|
146
209
|
traceAnalystFunctionGroup,
|
|
147
210
|
traceAnalystOnRunComplete
|
|
148
|
-
} from "./chunk-WOK2RTWG.js";
|
|
149
|
-
import {
|
|
150
|
-
TraceEmitter,
|
|
151
|
-
llmSpanFromProvider
|
|
152
|
-
} from "./chunk-5IIQKMD5.js";
|
|
211
|
+
} from "./chunk-4W4NCYM2.js";
|
|
153
212
|
import {
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
callLlm,
|
|
159
|
-
callLlmJson,
|
|
160
|
-
probeLlm,
|
|
161
|
-
stripFencedJson
|
|
162
|
-
} from "./chunk-3GN6U53I.js";
|
|
213
|
+
RunIntegrityError,
|
|
214
|
+
assertRunCaptured,
|
|
215
|
+
throwIfRunIncomplete
|
|
216
|
+
} from "./chunk-QUKKGHTZ.js";
|
|
163
217
|
import {
|
|
164
218
|
FileSystemRawProviderSink,
|
|
165
219
|
InMemoryRawProviderSink,
|
|
166
220
|
NoopRawProviderSink,
|
|
167
221
|
defaultProviderRedactor,
|
|
168
222
|
providerFromBaseUrl
|
|
169
|
-
} from "./chunk-SNUHRBDL.js";
|
|
223
|
+
} from "./chunk-SQQLHODJ.js";
|
|
224
|
+
import {
|
|
225
|
+
TraceEmitter,
|
|
226
|
+
llmSpanFromProvider
|
|
227
|
+
} from "./chunk-5IIQKMD5.js";
|
|
228
|
+
import {
|
|
229
|
+
canonicalize,
|
|
230
|
+
evaluateHypothesis,
|
|
231
|
+
hashJson,
|
|
232
|
+
signManifest,
|
|
233
|
+
verifyManifest
|
|
234
|
+
} from "./chunk-6M774GY6.js";
|
|
170
235
|
import "./chunk-PZ5AY32C.js";
|
|
171
236
|
|
|
172
237
|
// src/client.ts
|
|
@@ -10486,12 +10551,15 @@ export {
|
|
|
10486
10551
|
OTEL_AGENT_EVAL_SCOPE,
|
|
10487
10552
|
OtlpFileTraceStore,
|
|
10488
10553
|
PairwiseSteeringOptimizer,
|
|
10554
|
+
PredictiveValidityResearcher,
|
|
10489
10555
|
PrmGrader,
|
|
10490
10556
|
ProductClient,
|
|
10491
10557
|
ProjectRegistry,
|
|
10492
10558
|
PromptRegistry,
|
|
10493
10559
|
REDACTION_VERSION,
|
|
10494
10560
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
10561
|
+
ReplayCache,
|
|
10562
|
+
ReplayCacheMissError,
|
|
10495
10563
|
RunCritic,
|
|
10496
10564
|
RunIntegrityError,
|
|
10497
10565
|
RunRecordValidationError,
|
|
@@ -10513,12 +10581,15 @@ export {
|
|
|
10513
10581
|
UNIVERSAL_FINDERS,
|
|
10514
10582
|
acquisitionPlansForKnowledgeGaps,
|
|
10515
10583
|
adversarialJudge,
|
|
10584
|
+
adversarialScenarioSearch,
|
|
10516
10585
|
aggregateLlm,
|
|
10517
10586
|
aggregateRunScore,
|
|
10518
10587
|
allCriticalPassed,
|
|
10519
10588
|
analyzeAntiSlop,
|
|
10589
|
+
analyzeOptimizationResult,
|
|
10520
10590
|
analyzeSeries,
|
|
10521
10591
|
analyzeTraces,
|
|
10592
|
+
applyEloUpdate,
|
|
10522
10593
|
argHash,
|
|
10523
10594
|
assertLlmRoute,
|
|
10524
10595
|
assertReleaseConfidence,
|
|
@@ -10528,12 +10599,14 @@ export {
|
|
|
10528
10599
|
deterministicSplit as benchmarkDeterministicSplit,
|
|
10529
10600
|
benchmarks_exports as benchmarks,
|
|
10530
10601
|
benjaminiHochberg,
|
|
10602
|
+
bestOfN,
|
|
10531
10603
|
bhAdjust,
|
|
10532
10604
|
bisect,
|
|
10533
10605
|
blockingKnowledgeEval,
|
|
10534
10606
|
bonferroni,
|
|
10535
10607
|
bootstrapCi,
|
|
10536
10608
|
budgetBreachView,
|
|
10609
|
+
buildPairwiseFromCampaign,
|
|
10537
10610
|
buildReflectionPrompt,
|
|
10538
10611
|
buildReviewerPrompt,
|
|
10539
10612
|
buildTraceAnalystTools,
|
|
@@ -10559,6 +10632,7 @@ export {
|
|
|
10559
10632
|
coherenceJudge,
|
|
10560
10633
|
collectionPreserved,
|
|
10561
10634
|
commitBisect,
|
|
10635
|
+
compareAdaptationCurves,
|
|
10562
10636
|
compareReferenceReplay,
|
|
10563
10637
|
compareToBaseline,
|
|
10564
10638
|
compilerJudge,
|
|
@@ -10580,6 +10654,7 @@ export {
|
|
|
10580
10654
|
createFeedbackTrajectory,
|
|
10581
10655
|
createIntentMatchJudge,
|
|
10582
10656
|
createLlmReviewer,
|
|
10657
|
+
createReplayFetch,
|
|
10583
10658
|
createSandboxCodeMutator,
|
|
10584
10659
|
createSandboxPool,
|
|
10585
10660
|
createSemanticConceptJudge,
|
|
@@ -10594,15 +10669,18 @@ export {
|
|
|
10594
10669
|
defaultTraceInsightPanel,
|
|
10595
10670
|
deployGateLayer,
|
|
10596
10671
|
describeTraceInsightScope,
|
|
10672
|
+
detectRewardHacking,
|
|
10597
10673
|
distillPlaybook,
|
|
10598
10674
|
domainEvidencePattern,
|
|
10599
10675
|
dominates,
|
|
10676
|
+
doublyRobust,
|
|
10600
10677
|
estimateCost,
|
|
10601
10678
|
estimateTokens,
|
|
10602
10679
|
euAiActReport,
|
|
10603
10680
|
evaluateActionPolicy,
|
|
10604
10681
|
evaluateContract,
|
|
10605
10682
|
evaluateHypothesis,
|
|
10683
|
+
evaluateInterimReleaseConfidence,
|
|
10606
10684
|
evaluateOracles,
|
|
10607
10685
|
evaluateReleaseConfidence,
|
|
10608
10686
|
executeScenario,
|
|
@@ -10612,6 +10690,10 @@ export {
|
|
|
10612
10690
|
exportTrainingData,
|
|
10613
10691
|
extractAssetUrls,
|
|
10614
10692
|
extractErrorCount,
|
|
10693
|
+
extractPreferences,
|
|
10694
|
+
extractStepRewards,
|
|
10695
|
+
extractVerifiableReward,
|
|
10696
|
+
extractVerifiableRewardsFromRecords,
|
|
10615
10697
|
failureClusterView,
|
|
10616
10698
|
feedbackTrajectoriesToDatasetScenarios,
|
|
10617
10699
|
feedbackTrajectoriesToOptimizerRows,
|
|
@@ -10619,12 +10701,15 @@ export {
|
|
|
10619
10701
|
feedbackTrajectoryToOptimizerRow,
|
|
10620
10702
|
fileContains,
|
|
10621
10703
|
fileExists,
|
|
10704
|
+
filterDeterministicallyRewarded,
|
|
10622
10705
|
findAutoMatchNoExpectation,
|
|
10623
10706
|
findConstructorCwdDropped,
|
|
10624
10707
|
findFallbackToPass,
|
|
10625
10708
|
findLiteralTruePass,
|
|
10626
10709
|
findSkipCountsAsPass,
|
|
10627
10710
|
firstDivergenceView,
|
|
10711
|
+
firstPassK,
|
|
10712
|
+
fitBradleyTerry,
|
|
10628
10713
|
flowLayer,
|
|
10629
10714
|
formatBenchmarkReport,
|
|
10630
10715
|
formatDriverReport,
|
|
@@ -10640,12 +10725,14 @@ export {
|
|
|
10640
10725
|
inMemoryReferenceReplayStore,
|
|
10641
10726
|
inMemoryReviewStore,
|
|
10642
10727
|
inferDomainKeywords,
|
|
10728
|
+
injectIrrelevantClause,
|
|
10643
10729
|
integrationAsi,
|
|
10644
10730
|
integrationGateEvals,
|
|
10645
10731
|
integrationInvokeFailedPayload,
|
|
10646
10732
|
integrationManifestResolvedPayload,
|
|
10647
10733
|
integrationManifestValidatedPayload,
|
|
10648
10734
|
interRaterReliability,
|
|
10735
|
+
inverseProbabilityWeighting,
|
|
10649
10736
|
iqr,
|
|
10650
10737
|
isJudgeSpan,
|
|
10651
10738
|
isLlmSpan,
|
|
@@ -10654,6 +10741,7 @@ export {
|
|
|
10654
10741
|
isRunRecord,
|
|
10655
10742
|
isSandboxSpan,
|
|
10656
10743
|
isToolSpan,
|
|
10744
|
+
iterateRawCalls,
|
|
10657
10745
|
jestTestParser,
|
|
10658
10746
|
jsonHasKeys,
|
|
10659
10747
|
jsonShape,
|
|
@@ -10680,8 +10768,11 @@ export {
|
|
|
10680
10768
|
normalizeScores,
|
|
10681
10769
|
notBlocked,
|
|
10682
10770
|
objectiveEval,
|
|
10771
|
+
observationsFromRunRecords,
|
|
10772
|
+
offPolicyEstimateAll,
|
|
10683
10773
|
outputLengthRubric,
|
|
10684
10774
|
pairedBootstrap,
|
|
10775
|
+
pairedEvalueSequence,
|
|
10685
10776
|
pairedTTest,
|
|
10686
10777
|
pairedWilcoxon,
|
|
10687
10778
|
paraphraseRobustness,
|
|
@@ -10701,6 +10792,7 @@ export {
|
|
|
10701
10792
|
printDriverSummary,
|
|
10702
10793
|
prmBestOfN,
|
|
10703
10794
|
prmEnsembleBestOfN,
|
|
10795
|
+
prmTrainingPairs,
|
|
10704
10796
|
probeLlm,
|
|
10705
10797
|
promptBisect,
|
|
10706
10798
|
proposeSynthesisTargets,
|
|
@@ -10716,6 +10808,7 @@ export {
|
|
|
10716
10808
|
regexMatches,
|
|
10717
10809
|
regressionView,
|
|
10718
10810
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
10811
|
+
renameVariables,
|
|
10719
10812
|
renderMarkdown,
|
|
10720
10813
|
renderMarkdownReport,
|
|
10721
10814
|
renderPlaybookMarkdown,
|
|
@@ -10733,12 +10826,17 @@ export {
|
|
|
10733
10826
|
roundTripRunRecord,
|
|
10734
10827
|
rowCount,
|
|
10735
10828
|
rowWhere,
|
|
10829
|
+
rubricPredictiveValidity,
|
|
10830
|
+
runAdaptationCurve,
|
|
10736
10831
|
runAgentControlLoop,
|
|
10737
10832
|
runAssertions,
|
|
10738
10833
|
runBehavioralCanaries,
|
|
10739
10834
|
runCanaries,
|
|
10835
|
+
runComputeCurve,
|
|
10836
|
+
runContaminationProbe,
|
|
10740
10837
|
runCounterfactual,
|
|
10741
10838
|
runE2EWorkflow,
|
|
10839
|
+
runEvalCampaign,
|
|
10742
10840
|
runExpectations,
|
|
10743
10841
|
runFailureClass,
|
|
10744
10842
|
runHarnessExperiment,
|
|
@@ -10751,11 +10849,13 @@ export {
|
|
|
10751
10849
|
runPromptEvolution,
|
|
10752
10850
|
runProposeReview,
|
|
10753
10851
|
runProposeReviewAsControlLoop,
|
|
10852
|
+
runRLCampaign,
|
|
10754
10853
|
runReferenceReplay,
|
|
10755
10854
|
runSelfPlay,
|
|
10756
10855
|
runSemanticConceptJudge,
|
|
10757
10856
|
runTestGradedScenario,
|
|
10758
10857
|
runsForScenario,
|
|
10858
|
+
runwiseStepRewardSummary,
|
|
10759
10859
|
scalarScore,
|
|
10760
10860
|
scanForMuffledGates,
|
|
10761
10861
|
scoreAllProjects,
|
|
@@ -10768,12 +10868,16 @@ export {
|
|
|
10768
10868
|
scoreTraceInsightReadiness,
|
|
10769
10869
|
securityJudge,
|
|
10770
10870
|
selectHarnessVariant,
|
|
10871
|
+
selfConsistency,
|
|
10872
|
+
selfNormalizedImportanceWeighting,
|
|
10771
10873
|
selfPreference,
|
|
10772
10874
|
sentenceReorderMutator,
|
|
10773
10875
|
serializeFeedbackTrajectoriesJsonl,
|
|
10876
|
+
shuffleOrder,
|
|
10774
10877
|
signManifest,
|
|
10775
10878
|
soc2Report,
|
|
10776
10879
|
statusAdvanced,
|
|
10880
|
+
stepRewardsToJsonl,
|
|
10777
10881
|
stopOnNoProgress,
|
|
10778
10882
|
stopOnRepeatedAction,
|
|
10779
10883
|
stripFencedJson,
|
|
@@ -10785,10 +10889,21 @@ export {
|
|
|
10785
10889
|
summaryTable,
|
|
10786
10890
|
testJudge,
|
|
10787
10891
|
textInSnapshot,
|
|
10892
|
+
thompsonCurriculum,
|
|
10788
10893
|
throwIfRunIncomplete,
|
|
10894
|
+
toAnthropicFormat,
|
|
10895
|
+
toDpoJsonl,
|
|
10896
|
+
toDpoRows,
|
|
10897
|
+
toGrpoJsonl,
|
|
10898
|
+
toGrpoRows,
|
|
10789
10899
|
toLangfuseEnvelope,
|
|
10790
10900
|
toNdjson,
|
|
10901
|
+
toPrmJsonl,
|
|
10902
|
+
toPrmRows,
|
|
10791
10903
|
toPrometheusText,
|
|
10904
|
+
toSftJsonl,
|
|
10905
|
+
toSftRows,
|
|
10906
|
+
toTRLFormat,
|
|
10792
10907
|
tokenizeDomainWords,
|
|
10793
10908
|
toolIntentAlignmentRubric,
|
|
10794
10909
|
toolNamesForRun,
|
|
@@ -10798,12 +10913,17 @@ export {
|
|
|
10798
10913
|
toolWasteView,
|
|
10799
10914
|
traceAnalystFunctionGroup,
|
|
10800
10915
|
traceAnalystOnRunComplete,
|
|
10916
|
+
trialToRunRecord,
|
|
10801
10917
|
trialTraceFromMultiShotTrial,
|
|
10918
|
+
trialsToRunRecords,
|
|
10802
10919
|
typoMutator,
|
|
10803
10920
|
urlContains,
|
|
10804
10921
|
userQuestionsForKnowledgeGaps,
|
|
10805
10922
|
validateRunRecord,
|
|
10923
|
+
varianceBasedCurriculum,
|
|
10924
|
+
variantAggregateToRunRecord,
|
|
10806
10925
|
verbosityBias,
|
|
10926
|
+
verificationReportToRunRecord,
|
|
10807
10927
|
verifyManifest,
|
|
10808
10928
|
visualDiff,
|
|
10809
10929
|
viteDeployRunner,
|