@tangle-network/agent-eval 0.32.0 → 0.33.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +30 -0
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/chunk-DCZXFOQN.js +489 -0
- package/dist/chunk-DCZXFOQN.js.map +1 -0
- package/dist/{chunk-B73G44OH.js → chunk-FT3IAMQR.js} +5 -5
- package/dist/chunk-FT3IAMQR.js.map +1 -0
- package/dist/{chunk-GVQT44CS.js → chunk-KE7TDJUO.js} +2 -2
- package/dist/{chunk-4L3WJXQJ.js → chunk-KHZRNY3F.js} +163 -2
- package/dist/{chunk-4L3WJXQJ.js.map → chunk-KHZRNY3F.js.map} +1 -1
- package/dist/{chunk-WGXZAQLR.js → chunk-LGAPK7NA.js} +2 -2
- package/dist/{chunk-DTEJNZYK.js → chunk-SQYRO3BT.js} +47 -4
- package/dist/chunk-SQYRO3BT.js.map +1 -0
- package/dist/{chunk-CXJOVDJR.js → chunk-TQL7BAOY.js} +5 -175
- package/dist/chunk-TQL7BAOY.js.map +1 -0
- package/dist/{chunk-M6RZ5LJN.js → chunk-VXNVVBZO.js} +34 -5
- package/dist/chunk-VXNVVBZO.js.map +1 -0
- package/dist/{chunk-S4Y5VXMS.js → chunk-WRGHMGWT.js} +2 -2
- package/dist/{chunk-SMSGXM74.js → chunk-YU3G6I7F.js} +2 -2
- package/dist/cli.js +2 -2
- package/dist/{control-p2ns7elI.d.ts → control-C3k02SCP.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/control.js +3 -2
- package/dist/governance/index.d.ts +2 -1
- package/dist/{index-DPILdKbP.d.ts → index-CN2agEaO.d.ts} +2 -142
- package/dist/{index-BTqhGHJT.d.ts → index-ClMxVqe_.d.ts} +1 -1
- package/dist/index.d.ts +39 -486
- package/dist/index.js +75 -68
- package/dist/index.js.map +1 -1
- package/dist/judge-calibration-DilmB3Ml.d.ts +142 -0
- package/dist/meta-eval/index.d.ts +2 -2
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +3 -3
- package/dist/optimization.js +6 -6
- package/dist/pipelines/index.js +2 -2
- package/dist/release-report-ChfmCmLi.d.ts +713 -0
- package/dist/reporting.d.ts +6 -4
- package/dist/reporting.js +10 -9
- package/dist/{researcher-BRHa5Jxo.d.ts → researcher-CfnL3HEb.d.ts} +34 -3
- package/dist/rl.d.ts +5 -5
- package/dist/rl.js +6 -6
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-predictive-validity-CMHypZ_M.d.ts → rubric-predictive-validity-BvaNwfBE.d.ts} +1 -1
- package/dist/{run-record-BfX5y68A.d.ts → run-record-YinVdFwu.d.ts} +78 -2
- package/dist/{summary-report-D7AQS7eB.d.ts → summary-report-BPJVzIeW.d.ts} +2 -2
- package/dist/wire/index.js +2 -2
- package/docs/product-eval-adoption.md +18 -0
- package/package.json +22 -12
- package/dist/chunk-B73G44OH.js.map +0 -1
- package/dist/chunk-CXJOVDJR.js.map +0 -1
- package/dist/chunk-DTEJNZYK.js.map +0 -1
- package/dist/chunk-M6RZ5LJN.js.map +0 -1
- package/dist/chunk-ZN2CMQIW.js +0 -208
- package/dist/chunk-ZN2CMQIW.js.map +0 -1
- package/dist/release-report-DLWbBPtH.d.ts +0 -292
- /package/dist/{chunk-GVQT44CS.js.map → chunk-KE7TDJUO.js.map} +0 -0
- /package/dist/{chunk-WGXZAQLR.js.map → chunk-LGAPK7NA.js.map} +0 -0
- /package/dist/{chunk-S4Y5VXMS.js.map → chunk-WRGHMGWT.js.map} +0 -0
- /package/dist/{chunk-SMSGXM74.js.map → chunk-YU3G6I7F.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -11,7 +11,7 @@ import {
|
|
|
11
11
|
failureClusterView,
|
|
12
12
|
iqr,
|
|
13
13
|
welchsTTest
|
|
14
|
-
} from "./chunk-GVQT44CS.js";
|
|
14
|
+
} from "./chunk-KE7TDJUO.js";
|
|
15
15
|
import {
|
|
16
16
|
exportTrainingData,
|
|
17
17
|
toNdjson
|
|
@@ -54,7 +54,7 @@ import {
|
|
|
54
54
|
runProposeReview,
|
|
55
55
|
runProposeReviewAsControlLoop,
|
|
56
56
|
scoreFromEvals
|
|
57
|
-
} from "./chunk-S4Y5VXMS.js";
|
|
57
|
+
} from "./chunk-WRGHMGWT.js";
|
|
58
58
|
import {
|
|
59
59
|
allCriticalPassed,
|
|
60
60
|
objectiveEval,
|
|
@@ -96,14 +96,7 @@ import {
|
|
|
96
96
|
summarizePreferenceMemory,
|
|
97
97
|
trialTraceFromMultiShotTrial,
|
|
98
98
|
withAssignedFeedbackSplit
|
|
99
|
-
} from "./chunk-B73G44OH.js";
|
|
100
|
-
import {
|
|
101
|
-
RunRecordValidationError,
|
|
102
|
-
isRunRecord,
|
|
103
|
-
parseRunRecordSafe,
|
|
104
|
-
roundTripRunRecord,
|
|
105
|
-
validateRunRecord
|
|
106
|
-
} from "./chunk-ZN2CMQIW.js";
|
|
99
|
+
} from "./chunk-FT3IAMQR.js";
|
|
107
100
|
import {
|
|
108
101
|
assertReleaseConfidence,
|
|
109
102
|
bootstrapCi,
|
|
@@ -111,38 +104,52 @@ import {
|
|
|
111
104
|
judgeReplayGate,
|
|
112
105
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
113
106
|
renderReleaseReport
|
|
114
|
-
} from "./chunk-WGXZAQLR.js";
|
|
107
|
+
} from "./chunk-LGAPK7NA.js";
|
|
115
108
|
import {
|
|
116
109
|
runEvalCampaign
|
|
117
|
-
} from "./chunk-DTEJNZYK.js";
|
|
110
|
+
} from "./chunk-SQYRO3BT.js";
|
|
118
111
|
import {
|
|
119
112
|
LlmCallError,
|
|
120
113
|
LlmClient,
|
|
121
114
|
LlmRouteAssertionError,
|
|
122
115
|
assertLlmRoute,
|
|
116
|
+
backoffMs,
|
|
123
117
|
callLlm,
|
|
124
118
|
callLlmJson,
|
|
119
|
+
isTransientLlmError,
|
|
125
120
|
probeLlm,
|
|
126
121
|
stripFencedJson
|
|
127
|
-
} from "./chunk-M6RZ5LJN.js";
|
|
122
|
+
} from "./chunk-VXNVVBZO.js";
|
|
123
|
+
import {
|
|
124
|
+
AgentProfileCellValidationError,
|
|
125
|
+
RunRecordValidationError,
|
|
126
|
+
agentProfileCellHashMaterial,
|
|
127
|
+
agentProfileCellKey,
|
|
128
|
+
assertRunAgentProfileCell,
|
|
129
|
+
buildAgentProfileCell,
|
|
130
|
+
groupRunsByAgentProfileCell,
|
|
131
|
+
isRunRecord,
|
|
132
|
+
parseRunRecordSafe,
|
|
133
|
+
requireAgentProfileCell,
|
|
134
|
+
roundTripRunRecord,
|
|
135
|
+
validateAgentProfileCell,
|
|
136
|
+
validateRunRecord,
|
|
137
|
+
verifyAgentProfileCell
|
|
138
|
+
} from "./chunk-DCZXFOQN.js";
|
|
128
139
|
import {
|
|
129
140
|
evaluateInterimReleaseConfidence,
|
|
130
141
|
pairedEvalueSequence
|
|
131
142
|
} from "./chunk-MAZ26DC7.js";
|
|
132
143
|
import {
|
|
133
144
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
134
|
-
benjaminiHochberg,
|
|
135
|
-
bhAdjust,
|
|
136
|
-
bonferroni,
|
|
137
145
|
gainHistogram,
|
|
138
|
-
pairedBootstrap,
|
|
139
|
-
pairedWilcoxon,
|
|
140
146
|
paretoChart,
|
|
141
|
-
requiredSampleSize,
|
|
142
147
|
researchReport,
|
|
143
148
|
summaryTable
|
|
144
|
-
} from "./chunk-CXJOVDJR.js";
|
|
149
|
+
} from "./chunk-TQL7BAOY.js";
|
|
145
150
|
import {
|
|
151
|
+
benjaminiHochberg,
|
|
152
|
+
bonferroni,
|
|
146
153
|
calibrateJudge,
|
|
147
154
|
calibrateJudgeContinuous,
|
|
148
155
|
cohensD,
|
|
@@ -153,14 +160,17 @@ import {
|
|
|
153
160
|
interRaterReliability,
|
|
154
161
|
mannWhitneyU,
|
|
155
162
|
normalizeScores,
|
|
163
|
+
pairedBootstrap,
|
|
164
|
+
pairedMde,
|
|
156
165
|
pairedTTest,
|
|
157
166
|
partialCredit,
|
|
158
167
|
positionalBias,
|
|
168
|
+
requiredSampleSize,
|
|
159
169
|
selfPreference,
|
|
160
170
|
verbosityBias,
|
|
161
171
|
weightedMean,
|
|
162
172
|
wilcoxonSignedRank
|
|
163
|
-
} from "./chunk-4L3WJXQJ.js";
|
|
173
|
+
} from "./chunk-KHZRNY3F.js";
|
|
164
174
|
import {
|
|
165
175
|
DEFAULT_REDACTION_RULES,
|
|
166
176
|
DEFAULT_TRACE_ANALYST_BUDGETS,
|
|
@@ -2991,29 +3001,13 @@ var AgentDriver = class {
|
|
|
2991
3001
|
}
|
|
2992
3002
|
/** Use the driver LLM to decide what the "user" says next */
|
|
2993
3003
|
async decideNextMessage(persona, state, history) {
|
|
2994
|
-
|
|
2995
|
-
|
|
2996
|
-
|
|
2997
|
-
|
|
2998
|
-
|
|
2999
|
-
|
|
3000
|
-
role: "system",
|
|
3001
|
-
content: buildDriverSystemPrompt(persona, state, this.productContext)
|
|
3002
|
-
},
|
|
3003
|
-
{
|
|
3004
|
-
role: "user",
|
|
3005
|
-
content: recentHistory ? `Recent conversation:
|
|
3006
|
-
${recentHistory}
|
|
3007
|
-
|
|
3008
|
-
The agent's latest response:
|
|
3009
|
-
${lastResponse}` : "No conversation yet. Send your opening message \u2014 in character, phrased as this person actually would."
|
|
3010
|
-
}
|
|
3011
|
-
],
|
|
3012
|
-
temperature: 0.5,
|
|
3013
|
-
maxTokens: 700
|
|
3004
|
+
return decideNextUserTurn(this.tc, {
|
|
3005
|
+
persona,
|
|
3006
|
+
state,
|
|
3007
|
+
history,
|
|
3008
|
+
productContext: this.productContext,
|
|
3009
|
+
model: this.driverModel
|
|
3014
3010
|
});
|
|
3015
|
-
const content = resp.choices?.[0]?.message?.content ?? "";
|
|
3016
|
-
return content.trim();
|
|
3017
3011
|
}
|
|
3018
3012
|
/** Handle pending approvals based on persona feedback patterns */
|
|
3019
3013
|
async handleApprovals(persona, workspaceId, _state) {
|
|
@@ -3081,6 +3075,29 @@ Sign-off: respond with exactly "DONE" only when a ${persona.role} would act on t
|
|
|
3081
3075
|
|
|
3082
3076
|
Output ONLY your next message to the agent \u2014 in character, first person, no meta-commentary, no stage directions.`;
|
|
3083
3077
|
}
|
|
3078
|
+
async function decideNextUserTurn(tc, opts) {
|
|
3079
|
+
const { persona, state, history, productContext = "", model = "claude-sonnet-4-6" } = opts;
|
|
3080
|
+
const lastResponse = history.length > 0 ? history[history.length - 1].content.slice(0, 2e3) : "(no conversation yet \u2014 this is the first message)";
|
|
3081
|
+
const recentHistory = history.slice(-6).map((h) => `${h.role}: ${h.content.slice(0, 500)}`).join("\n\n");
|
|
3082
|
+
const resp = await tc.chat({
|
|
3083
|
+
model,
|
|
3084
|
+
messages: [
|
|
3085
|
+
{ role: "system", content: buildDriverSystemPrompt(persona, state, productContext) },
|
|
3086
|
+
{
|
|
3087
|
+
role: "user",
|
|
3088
|
+
content: recentHistory ? `Recent conversation:
|
|
3089
|
+
${recentHistory}
|
|
3090
|
+
|
|
3091
|
+
The agent's latest response:
|
|
3092
|
+
${lastResponse}` : "No conversation yet. Send your opening message \u2014 in character, phrased as this person actually would."
|
|
3093
|
+
}
|
|
3094
|
+
],
|
|
3095
|
+
temperature: 0.5,
|
|
3096
|
+
maxTokens: 700
|
|
3097
|
+
});
|
|
3098
|
+
const content = resp.choices?.[0]?.message?.content ?? "";
|
|
3099
|
+
return content.trim();
|
|
3100
|
+
}
|
|
3084
3101
|
|
|
3085
3102
|
// src/integration-gates.ts
|
|
3086
3103
|
function integrationManifestValidatedPayload(input) {
|
|
@@ -10309,35 +10326,14 @@ var JsonlTrialCache = class {
|
|
|
10309
10326
|
// src/judge-retry.ts
|
|
10310
10327
|
var DEFAULT_MAX_ATTEMPTS = 3;
|
|
10311
10328
|
var DEFAULT_TIMEOUT_MS = 9e4;
|
|
10312
|
-
var DEFAULT_BACKOFF = (attempt) => Math.min(500 * 2 ** attempt, 16e3);
|
|
10313
|
-
var ABORT_PATTERNS = [
|
|
10314
|
-
/AbortError/i,
|
|
10315
|
-
/TimeoutError/i,
|
|
10316
|
-
/fetch failed/i,
|
|
10317
|
-
/ECONNRESET/i,
|
|
10318
|
-
/ETIMEDOUT/i,
|
|
10319
|
-
/EAI_AGAIN/i,
|
|
10320
|
-
/this operation was aborted/i,
|
|
10321
|
-
/stream.*ended.*unexpectedly/i,
|
|
10322
|
-
/socket hang up/i
|
|
10323
|
-
];
|
|
10324
|
-
var RETRYABLE_HTTP_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);
|
|
10325
|
-
function defaultIsRetryable(err) {
|
|
10326
|
-
if (err instanceof Error) {
|
|
10327
|
-
if (ABORT_PATTERNS.some((p) => p.test(err.message) || p.test(err.name))) return true;
|
|
10328
|
-
const status = err.status;
|
|
10329
|
-
if (typeof status === "number" && RETRYABLE_HTTP_STATUS.has(status)) return true;
|
|
10330
|
-
}
|
|
10331
|
-
return false;
|
|
10332
|
-
}
|
|
10333
10329
|
function sleep(ms) {
|
|
10334
10330
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
10335
10331
|
}
|
|
10336
10332
|
async function withJudgeRetry(judgeFn, policy = {}) {
|
|
10337
10333
|
const maxAttempts = policy.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
|
|
10338
10334
|
const timeoutMs = policy.timeoutMs ?? DEFAULT_TIMEOUT_MS;
|
|
10339
|
-
const backoff = policy.backoffMs ?? DEFAULT_BACKOFF;
|
|
10340
|
-
const isRetryable = policy.isRetryable ?? defaultIsRetryable;
|
|
10335
|
+
const backoff = policy.backoffMs ?? backoffMs;
|
|
10336
|
+
const isRetryable = policy.isRetryable ?? isTransientLlmError;
|
|
10341
10337
|
const models = policy.models && policy.models.length > 0 ? policy.models : [void 0];
|
|
10342
10338
|
let totalAttempts = 0;
|
|
10343
10339
|
const attemptErrors = [];
|
|
@@ -10660,6 +10656,7 @@ export {
|
|
|
10660
10656
|
ANALYST_SEVERITIES,
|
|
10661
10657
|
AgentDriver,
|
|
10662
10658
|
AgentEvalError,
|
|
10659
|
+
AgentProfileCellValidationError,
|
|
10663
10660
|
AnalystRegistry,
|
|
10664
10661
|
AxGepaSteeringOptimizer,
|
|
10665
10662
|
BENCHMARK_SPLIT_SEED,
|
|
@@ -10767,6 +10764,8 @@ export {
|
|
|
10767
10764
|
VerificationError,
|
|
10768
10765
|
acquisitionPlansForKnowledgeGaps,
|
|
10769
10766
|
adversarialJudge,
|
|
10767
|
+
agentProfileCellHashMaterial,
|
|
10768
|
+
agentProfileCellKey,
|
|
10770
10769
|
aggregateLlm,
|
|
10771
10770
|
aggregateRunScore,
|
|
10772
10771
|
aggregateTrialsByMode,
|
|
@@ -10778,17 +10777,19 @@ export {
|
|
|
10778
10777
|
assertLlmRoute,
|
|
10779
10778
|
assertRealBackend,
|
|
10780
10779
|
assertReleaseConfidence,
|
|
10780
|
+
assertRunAgentProfileCell,
|
|
10781
10781
|
assertRunCaptured,
|
|
10782
10782
|
assignFeedbackSplit,
|
|
10783
10783
|
attributeCounterfactuals,
|
|
10784
|
+
backoffMs,
|
|
10784
10785
|
deterministicSplit as benchmarkDeterministicSplit,
|
|
10785
10786
|
benchmarks_exports as benchmarks,
|
|
10786
10787
|
benjaminiHochberg,
|
|
10787
|
-
bhAdjust,
|
|
10788
10788
|
bisect,
|
|
10789
10789
|
blockingKnowledgeEval,
|
|
10790
10790
|
bonferroni,
|
|
10791
10791
|
bootstrapCi,
|
|
10792
|
+
buildAgentProfileCell,
|
|
10792
10793
|
buildDriverSystemPrompt,
|
|
10793
10794
|
buildReflectionPrompt,
|
|
10794
10795
|
buildReviewerPrompt,
|
|
@@ -10853,6 +10854,7 @@ export {
|
|
|
10853
10854
|
createVerifierAdapter,
|
|
10854
10855
|
crossTraceDiff,
|
|
10855
10856
|
crowdingDistance,
|
|
10857
|
+
decideNextUserTurn,
|
|
10856
10858
|
decideReferenceReplayPromotion,
|
|
10857
10859
|
decideReferenceReplayRunPromotion,
|
|
10858
10860
|
defaultIsMaterial,
|
|
@@ -10904,6 +10906,7 @@ export {
|
|
|
10904
10906
|
precision as goldenPrecision,
|
|
10905
10907
|
gradeSemanticStatus,
|
|
10906
10908
|
groupBy,
|
|
10909
|
+
groupRunsByAgentProfileCell,
|
|
10907
10910
|
hashContent,
|
|
10908
10911
|
hashJson,
|
|
10909
10912
|
hashScenarios,
|
|
@@ -10925,6 +10928,7 @@ export {
|
|
|
10925
10928
|
isRunRecord,
|
|
10926
10929
|
isSandboxSpan,
|
|
10927
10930
|
isToolSpan,
|
|
10931
|
+
isTransientLlmError,
|
|
10928
10932
|
iterateRawCalls,
|
|
10929
10933
|
jestTestParser,
|
|
10930
10934
|
jsonHasKeys,
|
|
@@ -10954,8 +10958,8 @@ export {
|
|
|
10954
10958
|
objectiveEval,
|
|
10955
10959
|
pairedBootstrap,
|
|
10956
10960
|
pairedEvalueSequence,
|
|
10961
|
+
pairedMde,
|
|
10957
10962
|
pairedTTest,
|
|
10958
|
-
pairedWilcoxon,
|
|
10959
10963
|
paraphraseRobustness,
|
|
10960
10964
|
paraphraseRobustnessScenarios,
|
|
10961
10965
|
paretoChart,
|
|
@@ -11001,6 +11005,7 @@ export {
|
|
|
11001
11005
|
replayFeedbackTrajectory,
|
|
11002
11006
|
replayScorerOverCorpus,
|
|
11003
11007
|
replayTraceThroughJudge,
|
|
11008
|
+
requireAgentProfileCell,
|
|
11004
11009
|
requiredSampleSize,
|
|
11005
11010
|
researchReport,
|
|
11006
11011
|
resetLockedAppendersForTesting,
|
|
@@ -11071,8 +11076,10 @@ export {
|
|
|
11071
11076
|
typoMutator,
|
|
11072
11077
|
urlContains,
|
|
11073
11078
|
userQuestionsForKnowledgeGaps,
|
|
11079
|
+
validateAgentProfileCell,
|
|
11074
11080
|
validateRunRecord,
|
|
11075
11081
|
verbosityBias,
|
|
11082
|
+
verifyAgentProfileCell,
|
|
11076
11083
|
verifyCompletion,
|
|
11077
11084
|
verifyManifest,
|
|
11078
11085
|
visualDiff,
|