@tangle-network/agent-eval 0.32.0 → 0.33.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/CHANGELOG.md +30 -0
  2. package/dist/benchmarks/index.d.ts +2 -2
  3. package/dist/chunk-DCZXFOQN.js +489 -0
  4. package/dist/chunk-DCZXFOQN.js.map +1 -0
  5. package/dist/{chunk-B73G44OH.js → chunk-FT3IAMQR.js} +5 -5
  6. package/dist/chunk-FT3IAMQR.js.map +1 -0
  7. package/dist/{chunk-GVQT44CS.js → chunk-KE7TDJUO.js} +2 -2
  8. package/dist/{chunk-4L3WJXQJ.js → chunk-KHZRNY3F.js} +163 -2
  9. package/dist/{chunk-4L3WJXQJ.js.map → chunk-KHZRNY3F.js.map} +1 -1
  10. package/dist/{chunk-WGXZAQLR.js → chunk-LGAPK7NA.js} +2 -2
  11. package/dist/{chunk-DTEJNZYK.js → chunk-SQYRO3BT.js} +47 -4
  12. package/dist/chunk-SQYRO3BT.js.map +1 -0
  13. package/dist/{chunk-CXJOVDJR.js → chunk-TQL7BAOY.js} +5 -175
  14. package/dist/chunk-TQL7BAOY.js.map +1 -0
  15. package/dist/{chunk-M6RZ5LJN.js → chunk-VXNVVBZO.js} +34 -5
  16. package/dist/chunk-VXNVVBZO.js.map +1 -0
  17. package/dist/{chunk-S4Y5VXMS.js → chunk-WRGHMGWT.js} +2 -2
  18. package/dist/{chunk-SMSGXM74.js → chunk-YU3G6I7F.js} +2 -2
  19. package/dist/cli.js +2 -2
  20. package/dist/{control-p2ns7elI.d.ts → control-C3k02SCP.d.ts} +1 -1
  21. package/dist/control.d.ts +2 -2
  22. package/dist/control.js +3 -2
  23. package/dist/governance/index.d.ts +2 -1
  24. package/dist/{index-DPILdKbP.d.ts → index-CN2agEaO.d.ts} +2 -142
  25. package/dist/{index-BTqhGHJT.d.ts → index-ClMxVqe_.d.ts} +1 -1
  26. package/dist/index.d.ts +39 -486
  27. package/dist/index.js +75 -68
  28. package/dist/index.js.map +1 -1
  29. package/dist/judge-calibration-DilmB3Ml.d.ts +142 -0
  30. package/dist/meta-eval/index.d.ts +2 -2
  31. package/dist/openapi.json +1 -1
  32. package/dist/optimization.d.ts +3 -3
  33. package/dist/optimization.js +6 -6
  34. package/dist/pipelines/index.js +2 -2
  35. package/dist/release-report-ChfmCmLi.d.ts +713 -0
  36. package/dist/reporting.d.ts +6 -4
  37. package/dist/reporting.js +10 -9
  38. package/dist/{researcher-BRHa5Jxo.d.ts → researcher-CfnL3HEb.d.ts} +34 -3
  39. package/dist/rl.d.ts +5 -5
  40. package/dist/rl.js +6 -6
  41. package/dist/rl.js.map +1 -1
  42. package/dist/{rubric-predictive-validity-CMHypZ_M.d.ts → rubric-predictive-validity-BvaNwfBE.d.ts} +1 -1
  43. package/dist/{run-record-BfX5y68A.d.ts → run-record-YinVdFwu.d.ts} +78 -2
  44. package/dist/{summary-report-D7AQS7eB.d.ts → summary-report-BPJVzIeW.d.ts} +2 -2
  45. package/dist/wire/index.js +2 -2
  46. package/docs/product-eval-adoption.md +18 -0
  47. package/package.json +22 -12
  48. package/dist/chunk-B73G44OH.js.map +0 -1
  49. package/dist/chunk-CXJOVDJR.js.map +0 -1
  50. package/dist/chunk-DTEJNZYK.js.map +0 -1
  51. package/dist/chunk-M6RZ5LJN.js.map +0 -1
  52. package/dist/chunk-ZN2CMQIW.js +0 -208
  53. package/dist/chunk-ZN2CMQIW.js.map +0 -1
  54. package/dist/release-report-DLWbBPtH.d.ts +0 -292
  55. /package/dist/{chunk-GVQT44CS.js.map → chunk-KE7TDJUO.js.map} +0 -0
  56. /package/dist/{chunk-WGXZAQLR.js.map → chunk-LGAPK7NA.js.map} +0 -0
  57. /package/dist/{chunk-S4Y5VXMS.js.map → chunk-WRGHMGWT.js.map} +0 -0
  58. /package/dist/{chunk-SMSGXM74.js.map → chunk-YU3G6I7F.js.map} +0 -0
package/dist/index.js CHANGED
@@ -11,7 +11,7 @@ import {
11
11
  failureClusterView,
12
12
  iqr,
13
13
  welchsTTest
14
- } from "./chunk-GVQT44CS.js";
14
+ } from "./chunk-KE7TDJUO.js";
15
15
  import {
16
16
  exportTrainingData,
17
17
  toNdjson
@@ -54,7 +54,7 @@ import {
54
54
  runProposeReview,
55
55
  runProposeReviewAsControlLoop,
56
56
  scoreFromEvals
57
- } from "./chunk-S4Y5VXMS.js";
57
+ } from "./chunk-WRGHMGWT.js";
58
58
  import {
59
59
  allCriticalPassed,
60
60
  objectiveEval,
@@ -96,14 +96,7 @@ import {
96
96
  summarizePreferenceMemory,
97
97
  trialTraceFromMultiShotTrial,
98
98
  withAssignedFeedbackSplit
99
- } from "./chunk-B73G44OH.js";
100
- import {
101
- RunRecordValidationError,
102
- isRunRecord,
103
- parseRunRecordSafe,
104
- roundTripRunRecord,
105
- validateRunRecord
106
- } from "./chunk-ZN2CMQIW.js";
99
+ } from "./chunk-FT3IAMQR.js";
107
100
  import {
108
101
  assertReleaseConfidence,
109
102
  bootstrapCi,
@@ -111,38 +104,52 @@ import {
111
104
  judgeReplayGate,
112
105
  releaseTraceEvidenceFromMultiShotTrials,
113
106
  renderReleaseReport
114
- } from "./chunk-WGXZAQLR.js";
107
+ } from "./chunk-LGAPK7NA.js";
115
108
  import {
116
109
  runEvalCampaign
117
- } from "./chunk-DTEJNZYK.js";
110
+ } from "./chunk-SQYRO3BT.js";
118
111
  import {
119
112
  LlmCallError,
120
113
  LlmClient,
121
114
  LlmRouteAssertionError,
122
115
  assertLlmRoute,
116
+ backoffMs,
123
117
  callLlm,
124
118
  callLlmJson,
119
+ isTransientLlmError,
125
120
  probeLlm,
126
121
  stripFencedJson
127
- } from "./chunk-M6RZ5LJN.js";
122
+ } from "./chunk-VXNVVBZO.js";
123
+ import {
124
+ AgentProfileCellValidationError,
125
+ RunRecordValidationError,
126
+ agentProfileCellHashMaterial,
127
+ agentProfileCellKey,
128
+ assertRunAgentProfileCell,
129
+ buildAgentProfileCell,
130
+ groupRunsByAgentProfileCell,
131
+ isRunRecord,
132
+ parseRunRecordSafe,
133
+ requireAgentProfileCell,
134
+ roundTripRunRecord,
135
+ validateAgentProfileCell,
136
+ validateRunRecord,
137
+ verifyAgentProfileCell
138
+ } from "./chunk-DCZXFOQN.js";
128
139
  import {
129
140
  evaluateInterimReleaseConfidence,
130
141
  pairedEvalueSequence
131
142
  } from "./chunk-MAZ26DC7.js";
132
143
  import {
133
144
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
134
- benjaminiHochberg,
135
- bhAdjust,
136
- bonferroni,
137
145
  gainHistogram,
138
- pairedBootstrap,
139
- pairedWilcoxon,
140
146
  paretoChart,
141
- requiredSampleSize,
142
147
  researchReport,
143
148
  summaryTable
144
- } from "./chunk-CXJOVDJR.js";
149
+ } from "./chunk-TQL7BAOY.js";
145
150
  import {
151
+ benjaminiHochberg,
152
+ bonferroni,
146
153
  calibrateJudge,
147
154
  calibrateJudgeContinuous,
148
155
  cohensD,
@@ -153,14 +160,17 @@ import {
153
160
  interRaterReliability,
154
161
  mannWhitneyU,
155
162
  normalizeScores,
163
+ pairedBootstrap,
164
+ pairedMde,
156
165
  pairedTTest,
157
166
  partialCredit,
158
167
  positionalBias,
168
+ requiredSampleSize,
159
169
  selfPreference,
160
170
  verbosityBias,
161
171
  weightedMean,
162
172
  wilcoxonSignedRank
163
- } from "./chunk-4L3WJXQJ.js";
173
+ } from "./chunk-KHZRNY3F.js";
164
174
  import {
165
175
  DEFAULT_REDACTION_RULES,
166
176
  DEFAULT_TRACE_ANALYST_BUDGETS,
@@ -2991,29 +3001,13 @@ var AgentDriver = class {
2991
3001
  }
2992
3002
  /** Use the driver LLM to decide what the "user" says next */
2993
3003
  async decideNextMessage(persona, state, history) {
2994
- const lastResponse = history.length > 0 ? history[history.length - 1].content.slice(0, 2e3) : "(no conversation yet \u2014 this is the first message)";
2995
- const recentHistory = history.slice(-6).map((h) => `${h.role}: ${h.content.slice(0, 500)}`).join("\n\n");
2996
- const resp = await this.tc.chat({
2997
- model: this.driverModel,
2998
- messages: [
2999
- {
3000
- role: "system",
3001
- content: buildDriverSystemPrompt(persona, state, this.productContext)
3002
- },
3003
- {
3004
- role: "user",
3005
- content: recentHistory ? `Recent conversation:
3006
- ${recentHistory}
3007
-
3008
- The agent's latest response:
3009
- ${lastResponse}` : "No conversation yet. Send your opening message \u2014 in character, phrased as this person actually would."
3010
- }
3011
- ],
3012
- temperature: 0.5,
3013
- maxTokens: 700
3004
+ return decideNextUserTurn(this.tc, {
3005
+ persona,
3006
+ state,
3007
+ history,
3008
+ productContext: this.productContext,
3009
+ model: this.driverModel
3014
3010
  });
3015
- const content = resp.choices?.[0]?.message?.content ?? "";
3016
- return content.trim();
3017
3011
  }
3018
3012
  /** Handle pending approvals based on persona feedback patterns */
3019
3013
  async handleApprovals(persona, workspaceId, _state) {
@@ -3081,6 +3075,29 @@ Sign-off: respond with exactly "DONE" only when a ${persona.role} would act on t
3081
3075
 
3082
3076
  Output ONLY your next message to the agent \u2014 in character, first person, no meta-commentary, no stage directions.`;
3083
3077
  }
3078
+ async function decideNextUserTurn(tc, opts) {
3079
+ const { persona, state, history, productContext = "", model = "claude-sonnet-4-6" } = opts;
3080
+ const lastResponse = history.length > 0 ? history[history.length - 1].content.slice(0, 2e3) : "(no conversation yet \u2014 this is the first message)";
3081
+ const recentHistory = history.slice(-6).map((h) => `${h.role}: ${h.content.slice(0, 500)}`).join("\n\n");
3082
+ const resp = await tc.chat({
3083
+ model,
3084
+ messages: [
3085
+ { role: "system", content: buildDriverSystemPrompt(persona, state, productContext) },
3086
+ {
3087
+ role: "user",
3088
+ content: recentHistory ? `Recent conversation:
3089
+ ${recentHistory}
3090
+
3091
+ The agent's latest response:
3092
+ ${lastResponse}` : "No conversation yet. Send your opening message \u2014 in character, phrased as this person actually would."
3093
+ }
3094
+ ],
3095
+ temperature: 0.5,
3096
+ maxTokens: 700
3097
+ });
3098
+ const content = resp.choices?.[0]?.message?.content ?? "";
3099
+ return content.trim();
3100
+ }
3084
3101
 
3085
3102
  // src/integration-gates.ts
3086
3103
  function integrationManifestValidatedPayload(input) {
@@ -10309,35 +10326,14 @@ var JsonlTrialCache = class {
10309
10326
  // src/judge-retry.ts
10310
10327
  var DEFAULT_MAX_ATTEMPTS = 3;
10311
10328
  var DEFAULT_TIMEOUT_MS = 9e4;
10312
- var DEFAULT_BACKOFF = (attempt) => Math.min(500 * 2 ** attempt, 16e3);
10313
- var ABORT_PATTERNS = [
10314
- /AbortError/i,
10315
- /TimeoutError/i,
10316
- /fetch failed/i,
10317
- /ECONNRESET/i,
10318
- /ETIMEDOUT/i,
10319
- /EAI_AGAIN/i,
10320
- /this operation was aborted/i,
10321
- /stream.*ended.*unexpectedly/i,
10322
- /socket hang up/i
10323
- ];
10324
- var RETRYABLE_HTTP_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);
10325
- function defaultIsRetryable(err) {
10326
- if (err instanceof Error) {
10327
- if (ABORT_PATTERNS.some((p) => p.test(err.message) || p.test(err.name))) return true;
10328
- const status = err.status;
10329
- if (typeof status === "number" && RETRYABLE_HTTP_STATUS.has(status)) return true;
10330
- }
10331
- return false;
10332
- }
10333
10329
  function sleep(ms) {
10334
10330
  return new Promise((resolve) => setTimeout(resolve, ms));
10335
10331
  }
10336
10332
  async function withJudgeRetry(judgeFn, policy = {}) {
10337
10333
  const maxAttempts = policy.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
10338
10334
  const timeoutMs = policy.timeoutMs ?? DEFAULT_TIMEOUT_MS;
10339
- const backoff = policy.backoffMs ?? DEFAULT_BACKOFF;
10340
- const isRetryable = policy.isRetryable ?? defaultIsRetryable;
10335
+ const backoff = policy.backoffMs ?? backoffMs;
10336
+ const isRetryable = policy.isRetryable ?? isTransientLlmError;
10341
10337
  const models = policy.models && policy.models.length > 0 ? policy.models : [void 0];
10342
10338
  let totalAttempts = 0;
10343
10339
  const attemptErrors = [];
@@ -10660,6 +10656,7 @@ export {
10660
10656
  ANALYST_SEVERITIES,
10661
10657
  AgentDriver,
10662
10658
  AgentEvalError,
10659
+ AgentProfileCellValidationError,
10663
10660
  AnalystRegistry,
10664
10661
  AxGepaSteeringOptimizer,
10665
10662
  BENCHMARK_SPLIT_SEED,
@@ -10767,6 +10764,8 @@ export {
10767
10764
  VerificationError,
10768
10765
  acquisitionPlansForKnowledgeGaps,
10769
10766
  adversarialJudge,
10767
+ agentProfileCellHashMaterial,
10768
+ agentProfileCellKey,
10770
10769
  aggregateLlm,
10771
10770
  aggregateRunScore,
10772
10771
  aggregateTrialsByMode,
@@ -10778,17 +10777,19 @@ export {
10778
10777
  assertLlmRoute,
10779
10778
  assertRealBackend,
10780
10779
  assertReleaseConfidence,
10780
+ assertRunAgentProfileCell,
10781
10781
  assertRunCaptured,
10782
10782
  assignFeedbackSplit,
10783
10783
  attributeCounterfactuals,
10784
+ backoffMs,
10784
10785
  deterministicSplit as benchmarkDeterministicSplit,
10785
10786
  benchmarks_exports as benchmarks,
10786
10787
  benjaminiHochberg,
10787
- bhAdjust,
10788
10788
  bisect,
10789
10789
  blockingKnowledgeEval,
10790
10790
  bonferroni,
10791
10791
  bootstrapCi,
10792
+ buildAgentProfileCell,
10792
10793
  buildDriverSystemPrompt,
10793
10794
  buildReflectionPrompt,
10794
10795
  buildReviewerPrompt,
@@ -10853,6 +10854,7 @@ export {
10853
10854
  createVerifierAdapter,
10854
10855
  crossTraceDiff,
10855
10856
  crowdingDistance,
10857
+ decideNextUserTurn,
10856
10858
  decideReferenceReplayPromotion,
10857
10859
  decideReferenceReplayRunPromotion,
10858
10860
  defaultIsMaterial,
@@ -10904,6 +10906,7 @@ export {
10904
10906
  precision as goldenPrecision,
10905
10907
  gradeSemanticStatus,
10906
10908
  groupBy,
10909
+ groupRunsByAgentProfileCell,
10907
10910
  hashContent,
10908
10911
  hashJson,
10909
10912
  hashScenarios,
@@ -10925,6 +10928,7 @@ export {
10925
10928
  isRunRecord,
10926
10929
  isSandboxSpan,
10927
10930
  isToolSpan,
10931
+ isTransientLlmError,
10928
10932
  iterateRawCalls,
10929
10933
  jestTestParser,
10930
10934
  jsonHasKeys,
@@ -10954,8 +10958,8 @@ export {
10954
10958
  objectiveEval,
10955
10959
  pairedBootstrap,
10956
10960
  pairedEvalueSequence,
10961
+ pairedMde,
10957
10962
  pairedTTest,
10958
- pairedWilcoxon,
10959
10963
  paraphraseRobustness,
10960
10964
  paraphraseRobustnessScenarios,
10961
10965
  paretoChart,
@@ -11001,6 +11005,7 @@ export {
11001
11005
  replayFeedbackTrajectory,
11002
11006
  replayScorerOverCorpus,
11003
11007
  replayTraceThroughJudge,
11008
+ requireAgentProfileCell,
11004
11009
  requiredSampleSize,
11005
11010
  researchReport,
11006
11011
  resetLockedAppendersForTesting,
@@ -11071,8 +11076,10 @@ export {
11071
11076
  typoMutator,
11072
11077
  urlContains,
11073
11078
  userQuestionsForKnowledgeGaps,
11079
+ validateAgentProfileCell,
11074
11080
  validateRunRecord,
11075
11081
  verbosityBias,
11082
+ verifyAgentProfileCell,
11076
11083
  verifyCompletion,
11077
11084
  verifyManifest,
11078
11085
  visualDiff,