@tangle-network/agent-eval 0.33.0 → 0.33.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/chunk-DCZXFOQN.js +489 -0
- package/dist/chunk-DCZXFOQN.js.map +1 -0
- package/dist/{chunk-B73G44OH.js → chunk-FT3IAMQR.js} +5 -5
- package/dist/chunk-FT3IAMQR.js.map +1 -0
- package/dist/{chunk-GVQT44CS.js → chunk-KE7TDJUO.js} +2 -2
- package/dist/{chunk-4L3WJXQJ.js → chunk-KHZRNY3F.js} +163 -2
- package/dist/{chunk-4L3WJXQJ.js.map → chunk-KHZRNY3F.js.map} +1 -1
- package/dist/{chunk-WGXZAQLR.js → chunk-LGAPK7NA.js} +2 -2
- package/dist/{chunk-DTEJNZYK.js → chunk-SQYRO3BT.js} +47 -4
- package/dist/chunk-SQYRO3BT.js.map +1 -0
- package/dist/{chunk-CXJOVDJR.js → chunk-TQL7BAOY.js} +5 -175
- package/dist/chunk-TQL7BAOY.js.map +1 -0
- package/dist/{chunk-M6RZ5LJN.js → chunk-VXNVVBZO.js} +34 -5
- package/dist/chunk-VXNVVBZO.js.map +1 -0
- package/dist/{chunk-S4Y5VXMS.js → chunk-WRGHMGWT.js} +2 -2
- package/dist/{chunk-SMSGXM74.js → chunk-YU3G6I7F.js} +2 -2
- package/dist/cli.js +2 -2
- package/dist/{control-p2ns7elI.d.ts → control-C3k02SCP.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/control.js +3 -2
- package/dist/governance/index.d.ts +2 -1
- package/dist/{index-DPILdKbP.d.ts → index-CN2agEaO.d.ts} +2 -142
- package/dist/{index-BTqhGHJT.d.ts → index-ClMxVqe_.d.ts} +1 -1
- package/dist/index.d.ts +18 -486
- package/dist/index.js +45 -46
- package/dist/index.js.map +1 -1
- package/dist/judge-calibration-DilmB3Ml.d.ts +142 -0
- package/dist/meta-eval/index.d.ts +2 -2
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +3 -3
- package/dist/optimization.js +6 -6
- package/dist/pipelines/index.js +2 -2
- package/dist/release-report-ChfmCmLi.d.ts +713 -0
- package/dist/reporting.d.ts +6 -4
- package/dist/reporting.js +10 -9
- package/dist/{researcher-BRHa5Jxo.d.ts → researcher-CfnL3HEb.d.ts} +34 -3
- package/dist/rl.d.ts +5 -5
- package/dist/rl.js +6 -6
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-predictive-validity-CMHypZ_M.d.ts → rubric-predictive-validity-BvaNwfBE.d.ts} +1 -1
- package/dist/{run-record-BfX5y68A.d.ts → run-record-YinVdFwu.d.ts} +78 -2
- package/dist/{summary-report-D7AQS7eB.d.ts → summary-report-BPJVzIeW.d.ts} +2 -2
- package/dist/wire/index.js +2 -2
- package/docs/product-eval-adoption.md +18 -0
- package/package.json +1 -1
- package/dist/chunk-B73G44OH.js.map +0 -1
- package/dist/chunk-CXJOVDJR.js.map +0 -1
- package/dist/chunk-DTEJNZYK.js.map +0 -1
- package/dist/chunk-M6RZ5LJN.js.map +0 -1
- package/dist/chunk-ZN2CMQIW.js +0 -208
- package/dist/chunk-ZN2CMQIW.js.map +0 -1
- package/dist/release-report-DLWbBPtH.d.ts +0 -292
- /package/dist/{chunk-GVQT44CS.js.map → chunk-KE7TDJUO.js.map} +0 -0
- /package/dist/{chunk-WGXZAQLR.js.map → chunk-LGAPK7NA.js.map} +0 -0
- /package/dist/{chunk-S4Y5VXMS.js.map → chunk-WRGHMGWT.js.map} +0 -0
- /package/dist/{chunk-SMSGXM74.js.map → chunk-YU3G6I7F.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -11,7 +11,7 @@ import {
|
|
|
11
11
|
failureClusterView,
|
|
12
12
|
iqr,
|
|
13
13
|
welchsTTest
|
|
14
|
-
} from "./chunk-GVQT44CS.js";
|
|
14
|
+
} from "./chunk-KE7TDJUO.js";
|
|
15
15
|
import {
|
|
16
16
|
exportTrainingData,
|
|
17
17
|
toNdjson
|
|
@@ -54,7 +54,7 @@ import {
|
|
|
54
54
|
runProposeReview,
|
|
55
55
|
runProposeReviewAsControlLoop,
|
|
56
56
|
scoreFromEvals
|
|
57
|
-
} from "./chunk-S4Y5VXMS.js";
|
|
57
|
+
} from "./chunk-WRGHMGWT.js";
|
|
58
58
|
import {
|
|
59
59
|
allCriticalPassed,
|
|
60
60
|
objectiveEval,
|
|
@@ -96,14 +96,7 @@ import {
|
|
|
96
96
|
summarizePreferenceMemory,
|
|
97
97
|
trialTraceFromMultiShotTrial,
|
|
98
98
|
withAssignedFeedbackSplit
|
|
99
|
-
} from "./chunk-B73G44OH.js";
|
|
100
|
-
import {
|
|
101
|
-
RunRecordValidationError,
|
|
102
|
-
isRunRecord,
|
|
103
|
-
parseRunRecordSafe,
|
|
104
|
-
roundTripRunRecord,
|
|
105
|
-
validateRunRecord
|
|
106
|
-
} from "./chunk-ZN2CMQIW.js";
|
|
99
|
+
} from "./chunk-FT3IAMQR.js";
|
|
107
100
|
import {
|
|
108
101
|
assertReleaseConfidence,
|
|
109
102
|
bootstrapCi,
|
|
@@ -111,38 +104,52 @@ import {
|
|
|
111
104
|
judgeReplayGate,
|
|
112
105
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
113
106
|
renderReleaseReport
|
|
114
|
-
} from "./chunk-WGXZAQLR.js";
|
|
107
|
+
} from "./chunk-LGAPK7NA.js";
|
|
115
108
|
import {
|
|
116
109
|
runEvalCampaign
|
|
117
|
-
} from "./chunk-DTEJNZYK.js";
|
|
110
|
+
} from "./chunk-SQYRO3BT.js";
|
|
118
111
|
import {
|
|
119
112
|
LlmCallError,
|
|
120
113
|
LlmClient,
|
|
121
114
|
LlmRouteAssertionError,
|
|
122
115
|
assertLlmRoute,
|
|
116
|
+
backoffMs,
|
|
123
117
|
callLlm,
|
|
124
118
|
callLlmJson,
|
|
119
|
+
isTransientLlmError,
|
|
125
120
|
probeLlm,
|
|
126
121
|
stripFencedJson
|
|
127
|
-
} from "./chunk-M6RZ5LJN.js";
|
|
122
|
+
} from "./chunk-VXNVVBZO.js";
|
|
123
|
+
import {
|
|
124
|
+
AgentProfileCellValidationError,
|
|
125
|
+
RunRecordValidationError,
|
|
126
|
+
agentProfileCellHashMaterial,
|
|
127
|
+
agentProfileCellKey,
|
|
128
|
+
assertRunAgentProfileCell,
|
|
129
|
+
buildAgentProfileCell,
|
|
130
|
+
groupRunsByAgentProfileCell,
|
|
131
|
+
isRunRecord,
|
|
132
|
+
parseRunRecordSafe,
|
|
133
|
+
requireAgentProfileCell,
|
|
134
|
+
roundTripRunRecord,
|
|
135
|
+
validateAgentProfileCell,
|
|
136
|
+
validateRunRecord,
|
|
137
|
+
verifyAgentProfileCell
|
|
138
|
+
} from "./chunk-DCZXFOQN.js";
|
|
128
139
|
import {
|
|
129
140
|
evaluateInterimReleaseConfidence,
|
|
130
141
|
pairedEvalueSequence
|
|
131
142
|
} from "./chunk-MAZ26DC7.js";
|
|
132
143
|
import {
|
|
133
144
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
134
|
-
benjaminiHochberg,
|
|
135
|
-
bhAdjust,
|
|
136
|
-
bonferroni,
|
|
137
145
|
gainHistogram,
|
|
138
|
-
pairedBootstrap,
|
|
139
|
-
pairedWilcoxon,
|
|
140
146
|
paretoChart,
|
|
141
|
-
requiredSampleSize,
|
|
142
147
|
researchReport,
|
|
143
148
|
summaryTable
|
|
144
|
-
} from "./chunk-CXJOVDJR.js";
|
|
149
|
+
} from "./chunk-TQL7BAOY.js";
|
|
145
150
|
import {
|
|
151
|
+
benjaminiHochberg,
|
|
152
|
+
bonferroni,
|
|
146
153
|
calibrateJudge,
|
|
147
154
|
calibrateJudgeContinuous,
|
|
148
155
|
cohensD,
|
|
@@ -153,14 +160,17 @@ import {
|
|
|
153
160
|
interRaterReliability,
|
|
154
161
|
mannWhitneyU,
|
|
155
162
|
normalizeScores,
|
|
163
|
+
pairedBootstrap,
|
|
164
|
+
pairedMde,
|
|
156
165
|
pairedTTest,
|
|
157
166
|
partialCredit,
|
|
158
167
|
positionalBias,
|
|
168
|
+
requiredSampleSize,
|
|
159
169
|
selfPreference,
|
|
160
170
|
verbosityBias,
|
|
161
171
|
weightedMean,
|
|
162
172
|
wilcoxonSignedRank
|
|
163
|
-
} from "./chunk-4L3WJXQJ.js";
|
|
173
|
+
} from "./chunk-KHZRNY3F.js";
|
|
164
174
|
import {
|
|
165
175
|
DEFAULT_REDACTION_RULES,
|
|
166
176
|
DEFAULT_TRACE_ANALYST_BUDGETS,
|
|
@@ -10316,35 +10326,14 @@ var JsonlTrialCache = class {
|
|
|
10316
10326
|
// src/judge-retry.ts
|
|
10317
10327
|
var DEFAULT_MAX_ATTEMPTS = 3;
|
|
10318
10328
|
var DEFAULT_TIMEOUT_MS = 9e4;
|
|
10319
|
-
var DEFAULT_BACKOFF = (attempt) => Math.min(500 * 2 ** attempt, 16e3);
|
|
10320
|
-
var ABORT_PATTERNS = [
|
|
10321
|
-
/AbortError/i,
|
|
10322
|
-
/TimeoutError/i,
|
|
10323
|
-
/fetch failed/i,
|
|
10324
|
-
/ECONNRESET/i,
|
|
10325
|
-
/ETIMEDOUT/i,
|
|
10326
|
-
/EAI_AGAIN/i,
|
|
10327
|
-
/this operation was aborted/i,
|
|
10328
|
-
/stream.*ended.*unexpectedly/i,
|
|
10329
|
-
/socket hang up/i
|
|
10330
|
-
];
|
|
10331
|
-
var RETRYABLE_HTTP_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);
|
|
10332
|
-
function defaultIsRetryable(err) {
|
|
10333
|
-
if (err instanceof Error) {
|
|
10334
|
-
if (ABORT_PATTERNS.some((p) => p.test(err.message) || p.test(err.name))) return true;
|
|
10335
|
-
const status = err.status;
|
|
10336
|
-
if (typeof status === "number" && RETRYABLE_HTTP_STATUS.has(status)) return true;
|
|
10337
|
-
}
|
|
10338
|
-
return false;
|
|
10339
|
-
}
|
|
10340
10329
|
function sleep(ms) {
|
|
10341
10330
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
10342
10331
|
}
|
|
10343
10332
|
async function withJudgeRetry(judgeFn, policy = {}) {
|
|
10344
10333
|
const maxAttempts = policy.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
|
|
10345
10334
|
const timeoutMs = policy.timeoutMs ?? DEFAULT_TIMEOUT_MS;
|
|
10346
|
-
const backoff = policy.backoffMs ?? DEFAULT_BACKOFF;
|
|
10347
|
-
const isRetryable = policy.isRetryable ?? defaultIsRetryable;
|
|
10335
|
+
const backoff = policy.backoffMs ?? backoffMs;
|
|
10336
|
+
const isRetryable = policy.isRetryable ?? isTransientLlmError;
|
|
10348
10337
|
const models = policy.models && policy.models.length > 0 ? policy.models : [void 0];
|
|
10349
10338
|
let totalAttempts = 0;
|
|
10350
10339
|
const attemptErrors = [];
|
|
@@ -10667,6 +10656,7 @@ export {
|
|
|
10667
10656
|
ANALYST_SEVERITIES,
|
|
10668
10657
|
AgentDriver,
|
|
10669
10658
|
AgentEvalError,
|
|
10659
|
+
AgentProfileCellValidationError,
|
|
10670
10660
|
AnalystRegistry,
|
|
10671
10661
|
AxGepaSteeringOptimizer,
|
|
10672
10662
|
BENCHMARK_SPLIT_SEED,
|
|
@@ -10774,6 +10764,8 @@ export {
|
|
|
10774
10764
|
VerificationError,
|
|
10775
10765
|
acquisitionPlansForKnowledgeGaps,
|
|
10776
10766
|
adversarialJudge,
|
|
10767
|
+
agentProfileCellHashMaterial,
|
|
10768
|
+
agentProfileCellKey,
|
|
10777
10769
|
aggregateLlm,
|
|
10778
10770
|
aggregateRunScore,
|
|
10779
10771
|
aggregateTrialsByMode,
|
|
@@ -10785,17 +10777,19 @@ export {
|
|
|
10785
10777
|
assertLlmRoute,
|
|
10786
10778
|
assertRealBackend,
|
|
10787
10779
|
assertReleaseConfidence,
|
|
10780
|
+
assertRunAgentProfileCell,
|
|
10788
10781
|
assertRunCaptured,
|
|
10789
10782
|
assignFeedbackSplit,
|
|
10790
10783
|
attributeCounterfactuals,
|
|
10784
|
+
backoffMs,
|
|
10791
10785
|
deterministicSplit as benchmarkDeterministicSplit,
|
|
10792
10786
|
benchmarks_exports as benchmarks,
|
|
10793
10787
|
benjaminiHochberg,
|
|
10794
|
-
bhAdjust,
|
|
10795
10788
|
bisect,
|
|
10796
10789
|
blockingKnowledgeEval,
|
|
10797
10790
|
bonferroni,
|
|
10798
10791
|
bootstrapCi,
|
|
10792
|
+
buildAgentProfileCell,
|
|
10799
10793
|
buildDriverSystemPrompt,
|
|
10800
10794
|
buildReflectionPrompt,
|
|
10801
10795
|
buildReviewerPrompt,
|
|
@@ -10912,6 +10906,7 @@ export {
|
|
|
10912
10906
|
precision as goldenPrecision,
|
|
10913
10907
|
gradeSemanticStatus,
|
|
10914
10908
|
groupBy,
|
|
10909
|
+
groupRunsByAgentProfileCell,
|
|
10915
10910
|
hashContent,
|
|
10916
10911
|
hashJson,
|
|
10917
10912
|
hashScenarios,
|
|
@@ -10933,6 +10928,7 @@ export {
|
|
|
10933
10928
|
isRunRecord,
|
|
10934
10929
|
isSandboxSpan,
|
|
10935
10930
|
isToolSpan,
|
|
10931
|
+
isTransientLlmError,
|
|
10936
10932
|
iterateRawCalls,
|
|
10937
10933
|
jestTestParser,
|
|
10938
10934
|
jsonHasKeys,
|
|
@@ -10962,8 +10958,8 @@ export {
|
|
|
10962
10958
|
objectiveEval,
|
|
10963
10959
|
pairedBootstrap,
|
|
10964
10960
|
pairedEvalueSequence,
|
|
10961
|
+
pairedMde,
|
|
10965
10962
|
pairedTTest,
|
|
10966
|
-
pairedWilcoxon,
|
|
10967
10963
|
paraphraseRobustness,
|
|
10968
10964
|
paraphraseRobustnessScenarios,
|
|
10969
10965
|
paretoChart,
|
|
@@ -11009,6 +11005,7 @@ export {
|
|
|
11009
11005
|
replayFeedbackTrajectory,
|
|
11010
11006
|
replayScorerOverCorpus,
|
|
11011
11007
|
replayTraceThroughJudge,
|
|
11008
|
+
requireAgentProfileCell,
|
|
11012
11009
|
requiredSampleSize,
|
|
11013
11010
|
researchReport,
|
|
11014
11011
|
resetLockedAppendersForTesting,
|
|
@@ -11079,8 +11076,10 @@ export {
|
|
|
11079
11076
|
typoMutator,
|
|
11080
11077
|
urlContains,
|
|
11081
11078
|
userQuestionsForKnowledgeGaps,
|
|
11079
|
+
validateAgentProfileCell,
|
|
11082
11080
|
validateRunRecord,
|
|
11083
11081
|
verbosityBias,
|
|
11082
|
+
verifyAgentProfileCell,
|
|
11084
11083
|
verifyCompletion,
|
|
11085
11084
|
verifyManifest,
|
|
11086
11085
|
visualDiff,
|