@tangle-network/agent-eval 0.33.0 → 0.33.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/dist/benchmarks/index.d.ts +2 -2
  2. package/dist/chunk-DCZXFOQN.js +489 -0
  3. package/dist/chunk-DCZXFOQN.js.map +1 -0
  4. package/dist/{chunk-B73G44OH.js → chunk-FT3IAMQR.js} +5 -5
  5. package/dist/chunk-FT3IAMQR.js.map +1 -0
  6. package/dist/{chunk-GVQT44CS.js → chunk-KE7TDJUO.js} +2 -2
  7. package/dist/{chunk-4L3WJXQJ.js → chunk-KHZRNY3F.js} +163 -2
  8. package/dist/{chunk-4L3WJXQJ.js.map → chunk-KHZRNY3F.js.map} +1 -1
  9. package/dist/{chunk-WGXZAQLR.js → chunk-LGAPK7NA.js} +2 -2
  10. package/dist/{chunk-DTEJNZYK.js → chunk-SQYRO3BT.js} +47 -4
  11. package/dist/chunk-SQYRO3BT.js.map +1 -0
  12. package/dist/{chunk-CXJOVDJR.js → chunk-TQL7BAOY.js} +5 -175
  13. package/dist/chunk-TQL7BAOY.js.map +1 -0
  14. package/dist/{chunk-M6RZ5LJN.js → chunk-VXNVVBZO.js} +34 -5
  15. package/dist/chunk-VXNVVBZO.js.map +1 -0
  16. package/dist/{chunk-S4Y5VXMS.js → chunk-WRGHMGWT.js} +2 -2
  17. package/dist/{chunk-SMSGXM74.js → chunk-YU3G6I7F.js} +2 -2
  18. package/dist/cli.js +2 -2
  19. package/dist/{control-p2ns7elI.d.ts → control-C3k02SCP.d.ts} +1 -1
  20. package/dist/control.d.ts +2 -2
  21. package/dist/control.js +3 -2
  22. package/dist/governance/index.d.ts +2 -1
  23. package/dist/{index-DPILdKbP.d.ts → index-CN2agEaO.d.ts} +2 -142
  24. package/dist/{index-BTqhGHJT.d.ts → index-ClMxVqe_.d.ts} +1 -1
  25. package/dist/index.d.ts +18 -486
  26. package/dist/index.js +45 -46
  27. package/dist/index.js.map +1 -1
  28. package/dist/judge-calibration-DilmB3Ml.d.ts +142 -0
  29. package/dist/meta-eval/index.d.ts +2 -2
  30. package/dist/openapi.json +1 -1
  31. package/dist/optimization.d.ts +3 -3
  32. package/dist/optimization.js +6 -6
  33. package/dist/pipelines/index.js +2 -2
  34. package/dist/release-report-ChfmCmLi.d.ts +713 -0
  35. package/dist/reporting.d.ts +6 -4
  36. package/dist/reporting.js +10 -9
  37. package/dist/{researcher-BRHa5Jxo.d.ts → researcher-CfnL3HEb.d.ts} +34 -3
  38. package/dist/rl.d.ts +5 -5
  39. package/dist/rl.js +6 -6
  40. package/dist/rl.js.map +1 -1
  41. package/dist/{rubric-predictive-validity-CMHypZ_M.d.ts → rubric-predictive-validity-BvaNwfBE.d.ts} +1 -1
  42. package/dist/{run-record-BfX5y68A.d.ts → run-record-YinVdFwu.d.ts} +78 -2
  43. package/dist/{summary-report-D7AQS7eB.d.ts → summary-report-BPJVzIeW.d.ts} +2 -2
  44. package/dist/wire/index.js +2 -2
  45. package/docs/product-eval-adoption.md +18 -0
  46. package/package.json +1 -1
  47. package/dist/chunk-B73G44OH.js.map +0 -1
  48. package/dist/chunk-CXJOVDJR.js.map +0 -1
  49. package/dist/chunk-DTEJNZYK.js.map +0 -1
  50. package/dist/chunk-M6RZ5LJN.js.map +0 -1
  51. package/dist/chunk-ZN2CMQIW.js +0 -208
  52. package/dist/chunk-ZN2CMQIW.js.map +0 -1
  53. package/dist/release-report-DLWbBPtH.d.ts +0 -292
  54. /package/dist/{chunk-GVQT44CS.js.map → chunk-KE7TDJUO.js.map} +0 -0
  55. /package/dist/{chunk-WGXZAQLR.js.map → chunk-LGAPK7NA.js.map} +0 -0
  56. /package/dist/{chunk-S4Y5VXMS.js.map → chunk-WRGHMGWT.js.map} +0 -0
  57. /package/dist/{chunk-SMSGXM74.js.map → chunk-YU3G6I7F.js.map} +0 -0
package/dist/index.js CHANGED
@@ -11,7 +11,7 @@ import {
11
11
  failureClusterView,
12
12
  iqr,
13
13
  welchsTTest
14
- } from "./chunk-GVQT44CS.js";
14
+ } from "./chunk-KE7TDJUO.js";
15
15
  import {
16
16
  exportTrainingData,
17
17
  toNdjson
@@ -54,7 +54,7 @@ import {
54
54
  runProposeReview,
55
55
  runProposeReviewAsControlLoop,
56
56
  scoreFromEvals
57
- } from "./chunk-S4Y5VXMS.js";
57
+ } from "./chunk-WRGHMGWT.js";
58
58
  import {
59
59
  allCriticalPassed,
60
60
  objectiveEval,
@@ -96,14 +96,7 @@ import {
96
96
  summarizePreferenceMemory,
97
97
  trialTraceFromMultiShotTrial,
98
98
  withAssignedFeedbackSplit
99
- } from "./chunk-B73G44OH.js";
100
- import {
101
- RunRecordValidationError,
102
- isRunRecord,
103
- parseRunRecordSafe,
104
- roundTripRunRecord,
105
- validateRunRecord
106
- } from "./chunk-ZN2CMQIW.js";
99
+ } from "./chunk-FT3IAMQR.js";
107
100
  import {
108
101
  assertReleaseConfidence,
109
102
  bootstrapCi,
@@ -111,38 +104,52 @@ import {
111
104
  judgeReplayGate,
112
105
  releaseTraceEvidenceFromMultiShotTrials,
113
106
  renderReleaseReport
114
- } from "./chunk-WGXZAQLR.js";
107
+ } from "./chunk-LGAPK7NA.js";
115
108
  import {
116
109
  runEvalCampaign
117
- } from "./chunk-DTEJNZYK.js";
110
+ } from "./chunk-SQYRO3BT.js";
118
111
  import {
119
112
  LlmCallError,
120
113
  LlmClient,
121
114
  LlmRouteAssertionError,
122
115
  assertLlmRoute,
116
+ backoffMs,
123
117
  callLlm,
124
118
  callLlmJson,
119
+ isTransientLlmError,
125
120
  probeLlm,
126
121
  stripFencedJson
127
- } from "./chunk-M6RZ5LJN.js";
122
+ } from "./chunk-VXNVVBZO.js";
123
+ import {
124
+ AgentProfileCellValidationError,
125
+ RunRecordValidationError,
126
+ agentProfileCellHashMaterial,
127
+ agentProfileCellKey,
128
+ assertRunAgentProfileCell,
129
+ buildAgentProfileCell,
130
+ groupRunsByAgentProfileCell,
131
+ isRunRecord,
132
+ parseRunRecordSafe,
133
+ requireAgentProfileCell,
134
+ roundTripRunRecord,
135
+ validateAgentProfileCell,
136
+ validateRunRecord,
137
+ verifyAgentProfileCell
138
+ } from "./chunk-DCZXFOQN.js";
128
139
  import {
129
140
  evaluateInterimReleaseConfidence,
130
141
  pairedEvalueSequence
131
142
  } from "./chunk-MAZ26DC7.js";
132
143
  import {
133
144
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
134
- benjaminiHochberg,
135
- bhAdjust,
136
- bonferroni,
137
145
  gainHistogram,
138
- pairedBootstrap,
139
- pairedWilcoxon,
140
146
  paretoChart,
141
- requiredSampleSize,
142
147
  researchReport,
143
148
  summaryTable
144
- } from "./chunk-CXJOVDJR.js";
149
+ } from "./chunk-TQL7BAOY.js";
145
150
  import {
151
+ benjaminiHochberg,
152
+ bonferroni,
146
153
  calibrateJudge,
147
154
  calibrateJudgeContinuous,
148
155
  cohensD,
@@ -153,14 +160,17 @@ import {
153
160
  interRaterReliability,
154
161
  mannWhitneyU,
155
162
  normalizeScores,
163
+ pairedBootstrap,
164
+ pairedMde,
156
165
  pairedTTest,
157
166
  partialCredit,
158
167
  positionalBias,
168
+ requiredSampleSize,
159
169
  selfPreference,
160
170
  verbosityBias,
161
171
  weightedMean,
162
172
  wilcoxonSignedRank
163
- } from "./chunk-4L3WJXQJ.js";
173
+ } from "./chunk-KHZRNY3F.js";
164
174
  import {
165
175
  DEFAULT_REDACTION_RULES,
166
176
  DEFAULT_TRACE_ANALYST_BUDGETS,
@@ -10316,35 +10326,14 @@ var JsonlTrialCache = class {
10316
10326
  // src/judge-retry.ts
10317
10327
  var DEFAULT_MAX_ATTEMPTS = 3;
10318
10328
  var DEFAULT_TIMEOUT_MS = 9e4;
10319
- var DEFAULT_BACKOFF = (attempt) => Math.min(500 * 2 ** attempt, 16e3);
10320
- var ABORT_PATTERNS = [
10321
- /AbortError/i,
10322
- /TimeoutError/i,
10323
- /fetch failed/i,
10324
- /ECONNRESET/i,
10325
- /ETIMEDOUT/i,
10326
- /EAI_AGAIN/i,
10327
- /this operation was aborted/i,
10328
- /stream.*ended.*unexpectedly/i,
10329
- /socket hang up/i
10330
- ];
10331
- var RETRYABLE_HTTP_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);
10332
- function defaultIsRetryable(err) {
10333
- if (err instanceof Error) {
10334
- if (ABORT_PATTERNS.some((p) => p.test(err.message) || p.test(err.name))) return true;
10335
- const status = err.status;
10336
- if (typeof status === "number" && RETRYABLE_HTTP_STATUS.has(status)) return true;
10337
- }
10338
- return false;
10339
- }
10340
10329
  function sleep(ms) {
10341
10330
  return new Promise((resolve) => setTimeout(resolve, ms));
10342
10331
  }
10343
10332
  async function withJudgeRetry(judgeFn, policy = {}) {
10344
10333
  const maxAttempts = policy.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
10345
10334
  const timeoutMs = policy.timeoutMs ?? DEFAULT_TIMEOUT_MS;
10346
- const backoff = policy.backoffMs ?? DEFAULT_BACKOFF;
10347
- const isRetryable = policy.isRetryable ?? defaultIsRetryable;
10335
+ const backoff = policy.backoffMs ?? backoffMs;
10336
+ const isRetryable = policy.isRetryable ?? isTransientLlmError;
10348
10337
  const models = policy.models && policy.models.length > 0 ? policy.models : [void 0];
10349
10338
  let totalAttempts = 0;
10350
10339
  const attemptErrors = [];
@@ -10667,6 +10656,7 @@ export {
10667
10656
  ANALYST_SEVERITIES,
10668
10657
  AgentDriver,
10669
10658
  AgentEvalError,
10659
+ AgentProfileCellValidationError,
10670
10660
  AnalystRegistry,
10671
10661
  AxGepaSteeringOptimizer,
10672
10662
  BENCHMARK_SPLIT_SEED,
@@ -10774,6 +10764,8 @@ export {
10774
10764
  VerificationError,
10775
10765
  acquisitionPlansForKnowledgeGaps,
10776
10766
  adversarialJudge,
10767
+ agentProfileCellHashMaterial,
10768
+ agentProfileCellKey,
10777
10769
  aggregateLlm,
10778
10770
  aggregateRunScore,
10779
10771
  aggregateTrialsByMode,
@@ -10785,17 +10777,19 @@ export {
10785
10777
  assertLlmRoute,
10786
10778
  assertRealBackend,
10787
10779
  assertReleaseConfidence,
10780
+ assertRunAgentProfileCell,
10788
10781
  assertRunCaptured,
10789
10782
  assignFeedbackSplit,
10790
10783
  attributeCounterfactuals,
10784
+ backoffMs,
10791
10785
  deterministicSplit as benchmarkDeterministicSplit,
10792
10786
  benchmarks_exports as benchmarks,
10793
10787
  benjaminiHochberg,
10794
- bhAdjust,
10795
10788
  bisect,
10796
10789
  blockingKnowledgeEval,
10797
10790
  bonferroni,
10798
10791
  bootstrapCi,
10792
+ buildAgentProfileCell,
10799
10793
  buildDriverSystemPrompt,
10800
10794
  buildReflectionPrompt,
10801
10795
  buildReviewerPrompt,
@@ -10912,6 +10906,7 @@ export {
10912
10906
  precision as goldenPrecision,
10913
10907
  gradeSemanticStatus,
10914
10908
  groupBy,
10909
+ groupRunsByAgentProfileCell,
10915
10910
  hashContent,
10916
10911
  hashJson,
10917
10912
  hashScenarios,
@@ -10933,6 +10928,7 @@ export {
10933
10928
  isRunRecord,
10934
10929
  isSandboxSpan,
10935
10930
  isToolSpan,
10931
+ isTransientLlmError,
10936
10932
  iterateRawCalls,
10937
10933
  jestTestParser,
10938
10934
  jsonHasKeys,
@@ -10962,8 +10958,8 @@ export {
10962
10958
  objectiveEval,
10963
10959
  pairedBootstrap,
10964
10960
  pairedEvalueSequence,
10961
+ pairedMde,
10965
10962
  pairedTTest,
10966
- pairedWilcoxon,
10967
10963
  paraphraseRobustness,
10968
10964
  paraphraseRobustnessScenarios,
10969
10965
  paretoChart,
@@ -11009,6 +11005,7 @@ export {
11009
11005
  replayFeedbackTrajectory,
11010
11006
  replayScorerOverCorpus,
11011
11007
  replayTraceThroughJudge,
11008
+ requireAgentProfileCell,
11012
11009
  requiredSampleSize,
11013
11010
  researchReport,
11014
11011
  resetLockedAppendersForTesting,
@@ -11079,8 +11076,10 @@ export {
11079
11076
  typoMutator,
11080
11077
  urlContains,
11081
11078
  userQuestionsForKnowledgeGaps,
11079
+ validateAgentProfileCell,
11082
11080
  validateRunRecord,
11083
11081
  verbosityBias,
11082
+ verifyAgentProfileCell,
11084
11083
  verifyCompletion,
11085
11084
  verifyManifest,
11086
11085
  visualDiff,