@tangle-network/agent-eval 0.25.0 → 0.27.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. package/CHANGELOG.md +145 -0
  2. package/README.md +5 -5
  3. package/dist/builder-eval/index.js +1 -1
  4. package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
  5. package/dist/{chunk-K2TPS5LB.js → chunk-4U4BKCXK.js} +2 -2
  6. package/dist/chunk-4U4BKCXK.js.map +1 -0
  7. package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
  8. package/dist/chunk-5AKPEK5L.js.map +1 -0
  9. package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
  10. package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
  11. package/dist/chunk-K33INZHH.js.map +1 -0
  12. package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
  13. package/dist/chunk-MAZ26DC7.js.map +1 -0
  14. package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
  15. package/dist/chunk-NCRFYPS3.js.map +1 -0
  16. package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
  17. package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
  18. package/dist/chunk-QHF6EQKK.js.map +1 -0
  19. package/dist/chunk-R5UQJNKC.js +722 -0
  20. package/dist/chunk-R5UQJNKC.js.map +1 -0
  21. package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
  22. package/dist/chunk-RUI6SIHY.js.map +1 -0
  23. package/dist/{chunk-EDUKQ5AM.js → chunk-SZSBQUIJ.js} +2 -2
  24. package/dist/chunk-SZSBQUIJ.js.map +1 -0
  25. package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
  26. package/dist/chunk-VSMTAMNK.js.map +1 -0
  27. package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
  28. package/dist/chunk-XFZCM5Z3.js.map +1 -0
  29. package/dist/cli.js +1 -1
  30. package/dist/{control-CBShYYA6.d.ts → control-BT4qnXiS.d.ts} +2 -2
  31. package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BZ_lVLYW.d.ts} +1 -0
  32. package/dist/control.d.ts +3 -3
  33. package/dist/control.js +2 -2
  34. package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -2
  35. package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-D1aGKusy.d.ts} +1 -1
  36. package/dist/governance/index.d.ts +1 -1
  37. package/dist/{index-Oj9fAPPN.d.ts → index-BhLlu-qO.d.ts} +63 -2
  38. package/dist/index.d.ts +279 -72
  39. package/dist/index.js +222 -136
  40. package/dist/index.js.map +1 -1
  41. package/dist/knowledge/index.d.ts +1 -1
  42. package/dist/knowledge/index.js +2 -2
  43. package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-U-c8ge1k.d.ts} +1 -1
  44. package/dist/openapi.json +1 -1
  45. package/dist/optimization.d.ts +5 -5
  46. package/dist/optimization.js +5 -5
  47. package/dist/pipelines/index.d.ts +1 -1
  48. package/dist/pipelines/index.js +2 -2
  49. package/dist/{release-report-BNgMdqPF.d.ts → release-report-CCQqnK46.d.ts} +1 -1
  50. package/dist/{replay-BL96gCEP.d.ts → replay-D7z0J43-.d.ts} +4 -5
  51. package/dist/reporting.d.ts +4 -4
  52. package/dist/reporting.js +5 -5
  53. package/dist/{researcher-BPT8x_NT.d.ts → researcher-G81CWc0q.d.ts} +9 -10
  54. package/dist/rl.d.ts +26 -44
  55. package/dist/rl.js +5 -5
  56. package/dist/rl.js.map +1 -1
  57. package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
  58. package/dist/{summary-report-C7VPYEj2.d.ts → summary-report-Dl4akLKX.d.ts} +13 -1
  59. package/dist/traces.d.ts +1 -1
  60. package/dist/traces.js +2 -2
  61. package/dist/wire/index.d.ts +2 -2
  62. package/dist/wire/index.js +1 -1
  63. package/docs/concepts.md +11 -0
  64. package/docs/research-report-methodology.md +4 -4
  65. package/docs/three-package-architecture.md +12 -24
  66. package/package.json +1 -1
  67. package/dist/chunk-2A5XJB43.js.map +0 -1
  68. package/dist/chunk-4F5DQN55.js.map +0 -1
  69. package/dist/chunk-5LBB5B3Z.js.map +0 -1
  70. package/dist/chunk-EDUKQ5AM.js.map +0 -1
  71. package/dist/chunk-I4MBDTY5.js +0 -272
  72. package/dist/chunk-I4MBDTY5.js.map +0 -1
  73. package/dist/chunk-JLZQWFV3.js.map +0 -1
  74. package/dist/chunk-K2TPS5LB.js.map +0 -1
  75. package/dist/chunk-LSH4MMOZ.js.map +0 -1
  76. package/dist/chunk-NU65VQ7M.js.map +0 -1
  77. package/dist/chunk-OWLAAMME.js.map +0 -1
  78. package/dist/chunk-SESZDQPX.js.map +0 -1
  79. /package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
  80. /package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
  81. /package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0
package/dist/index.js CHANGED
@@ -11,7 +11,7 @@ import {
11
11
  failureClusterView,
12
12
  iqr,
13
13
  welchsTTest
14
- } from "./chunk-JLZQWFV3.js";
14
+ } from "./chunk-K33INZHH.js";
15
15
  import {
16
16
  exportTrainingData,
17
17
  toNdjson
@@ -28,7 +28,7 @@ import {
28
28
  pytestTestParser,
29
29
  runTestGradedScenario,
30
30
  vitestTestParser
31
- } from "./chunk-OWLAAMME.js";
31
+ } from "./chunk-QHF6EQKK.js";
32
32
  import {
33
33
  classifyEuAiRisk,
34
34
  euAiActReport,
@@ -43,7 +43,7 @@ import {
43
43
  knowledgeReadinessTracePayload,
44
44
  scoreKnowledgeReadiness,
45
45
  userQuestionsForKnowledgeGaps
46
- } from "./chunk-WWYCWKUM.js";
46
+ } from "./chunk-3CKU6VGU.js";
47
47
  import {
48
48
  controlFailureClassFromVerification,
49
49
  controlRunToRunRecord,
@@ -54,7 +54,7 @@ import {
54
54
  runProposeReview,
55
55
  runProposeReviewAsControlLoop,
56
56
  scoreFromEvals
57
- } from "./chunk-ZN274SWR.js";
57
+ } from "./chunk-PALJO75S.js";
58
58
  import {
59
59
  allCriticalPassed,
60
60
  objectiveEval,
@@ -62,7 +62,7 @@ import {
62
62
  stopOnNoProgress,
63
63
  stopOnRepeatedAction,
64
64
  subjectiveEval
65
- } from "./chunk-LSH4MMOZ.js";
65
+ } from "./chunk-NCRFYPS3.js";
66
66
  import {
67
67
  CallbackResearcher,
68
68
  DEFAULT_MUTATION_PRIMITIVES,
@@ -96,7 +96,7 @@ import {
96
96
  summarizePreferenceMemory,
97
97
  trialTraceFromMultiShotTrial,
98
98
  withAssignedFeedbackSplit
99
- } from "./chunk-EDUKQ5AM.js";
99
+ } from "./chunk-SZSBQUIJ.js";
100
100
  import {
101
101
  RunRecordValidationError,
102
102
  isRunRecord,
@@ -111,10 +111,10 @@ import {
111
111
  judgeReplayGate,
112
112
  releaseTraceEvidenceFromMultiShotTrials,
113
113
  renderReleaseReport
114
- } from "./chunk-RAF443UI.js";
114
+ } from "./chunk-DBIGN5MJ.js";
115
115
  import {
116
116
  runEvalCampaign
117
- } from "./chunk-SESZDQPX.js";
117
+ } from "./chunk-RUI6SIHY.js";
118
118
  import {
119
119
  LlmCallError,
120
120
  LlmClient,
@@ -128,7 +128,7 @@ import {
128
128
  import {
129
129
  evaluateInterimReleaseConfidence,
130
130
  pairedEvalueSequence
131
- } from "./chunk-NU65VQ7M.js";
131
+ } from "./chunk-MAZ26DC7.js";
132
132
  import {
133
133
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
134
134
  benjaminiHochberg,
@@ -141,18 +141,26 @@ import {
141
141
  requiredSampleSize,
142
142
  researchReport,
143
143
  summaryTable
144
- } from "./chunk-2A5XJB43.js";
144
+ } from "./chunk-5AKPEK5L.js";
145
145
  import {
146
+ calibrateJudge,
147
+ calibrateJudgeContinuous,
146
148
  cohensD,
147
149
  confidenceInterval,
150
+ continuousAgreement,
151
+ corpusInterRaterAgreement,
152
+ corpusInterRaterAgreementFromJudgeScores,
148
153
  interRaterReliability,
149
154
  mannWhitneyU,
150
155
  normalizeScores,
151
156
  pairedTTest,
152
157
  partialCredit,
158
+ positionalBias,
159
+ selfPreference,
160
+ verbosityBias,
153
161
  weightedMean,
154
162
  wilcoxonSignedRank
155
- } from "./chunk-I4MBDTY5.js";
163
+ } from "./chunk-R5UQJNKC.js";
156
164
  import {
157
165
  DEFAULT_REDACTION_RULES,
158
166
  FileSystemTraceStore,
@@ -166,7 +174,7 @@ import {
166
174
  iterateRawCalls,
167
175
  redactString,
168
176
  redactValue
169
- } from "./chunk-K2TPS5LB.js";
177
+ } from "./chunk-4U4BKCXK.js";
170
178
  import {
171
179
  aggregateLlm,
172
180
  argHash,
@@ -208,7 +216,7 @@ import {
208
216
  hashJson,
209
217
  signManifest,
210
218
  verifyManifest
211
- } from "./chunk-4F5DQN55.js";
219
+ } from "./chunk-VSMTAMNK.js";
212
220
  import {
213
221
  AgentEvalError,
214
222
  CaptureIntegrityError,
@@ -425,12 +433,12 @@ function ghCliClient(opts = {}) {
425
433
  await exec("git", ["branch", "-D", input.branchName], { cwd });
426
434
  await run("git", ["checkout", "-b", input.branchName]);
427
435
  const { mkdir, writeFile } = await import("fs/promises");
428
- const { dirname: dirname5, join: join3, resolve } = await import("path");
436
+ const { dirname: dirname5, join: join4, resolve } = await import("path");
429
437
  for (const change of input.fileChanges) {
430
438
  const abs = resolve(cwd, change.path);
431
439
  await mkdir(dirname5(abs), { recursive: true });
432
440
  await writeFile(abs, change.contents, "utf8");
433
- await run("git", ["add", join3(change.path)]);
441
+ await run("git", ["add", join4(change.path)]);
434
442
  }
435
443
  const env = {};
436
444
  if (input.authorName) env.GIT_AUTHOR_NAME = input.authorName;
@@ -3073,36 +3081,36 @@ var FileSystemExperimentStore = class {
3073
3081
  return idx.listRuns(experimentId);
3074
3082
  }
3075
3083
  async ensureDir() {
3076
- const fs = await import("fs/promises");
3077
- await fs.mkdir(this.dir, { recursive: true });
3084
+ const fs2 = await import("fs/promises");
3085
+ await fs2.mkdir(this.dir, { recursive: true });
3078
3086
  }
3079
3087
  async append(name, record) {
3080
3088
  await this.ensureDir();
3081
- const fs = await import("fs/promises");
3089
+ const fs2 = await import("fs/promises");
3082
3090
  const path = await import("path");
3083
3091
  const active = path.join(this.dir, `${name}.ndjson`);
3084
3092
  try {
3085
- const stat = await fs.stat(active);
3093
+ const stat = await fs2.stat(active);
3086
3094
  if (stat.size >= this.maxBytes) {
3087
3095
  const rolled = path.join(this.dir, `${name}.${Date.now()}.ndjson`);
3088
- await fs.rename(active, rolled);
3096
+ await fs2.rename(active, rolled);
3089
3097
  }
3090
3098
  } catch {
3091
3099
  }
3092
- await fs.appendFile(active, `${JSON.stringify(record)}
3100
+ await fs2.appendFile(active, `${JSON.stringify(record)}
3093
3101
  `, "utf8");
3094
3102
  }
3095
3103
  async load() {
3096
3104
  if (this.loaded && this.index) return this.index;
3097
- const fs = await import("fs/promises");
3105
+ const fs2 = await import("fs/promises");
3098
3106
  const path = await import("path");
3099
3107
  const store = new InMemoryExperimentStore();
3100
3108
  try {
3101
- const entries = await fs.readdir(this.dir);
3109
+ const entries = await fs2.readdir(this.dir);
3102
3110
  const sorted = entries.filter((f) => f.endsWith(".ndjson")).sort((a, b) => a.localeCompare(b));
3103
3111
  for (const file of sorted) {
3104
3112
  const full = path.join(this.dir, file);
3105
- const content = await fs.readFile(full, "utf8");
3113
+ const content = await fs2.readFile(full, "utf8");
3106
3114
  const base = file.split(".")[0];
3107
3115
  for (const line of content.split("\n")) {
3108
3116
  if (!line.trim()) continue;
@@ -4956,114 +4964,6 @@ function seededShuffle(items, seed) {
4956
4964
  return out;
4957
4965
  }
4958
4966
 
4959
- // src/judge-calibration.ts
4960
- function calibrateJudge(golden, candidate) {
4961
- const map = /* @__PURE__ */ new Map();
4962
- for (const g of golden) map.set(g.itemId, { h: g.humanScore, j: NaN });
4963
- for (const c of candidate) {
4964
- const entry = map.get(c.itemId);
4965
- if (entry) entry.j = c.score;
4966
- }
4967
- const common = [...map.values()].filter((v) => Number.isFinite(v.j));
4968
- const n = common.length;
4969
- if (n < 2) {
4970
- return { n, pearson: NaN, kappa: NaN, mae: NaN, worstItems: [] };
4971
- }
4972
- const humans = common.map((c) => c.h);
4973
- const judges = common.map((c) => c.j);
4974
- const pearson = pearsonR(humans, judges);
4975
- const kappa = weightedKappa(humans.map(Math.round), judges.map(Math.round));
4976
- const absDiffs = common.map((c) => Math.abs(c.j - c.h));
4977
- const mae = absDiffs.reduce((a, b) => a + b, 0) / n;
4978
- const worst2 = [...map.entries()].filter(([, v]) => Number.isFinite(v.j)).map(([itemId, v]) => ({ itemId, judge: v.j, human: v.h, delta: Math.abs(v.j - v.h) })).sort((a, b) => b.delta - a.delta).slice(0, 5);
4979
- return { n, pearson, kappa, mae, worstItems: worst2 };
4980
- }
4981
- function positionalBias(scores) {
4982
- const pairs = /* @__PURE__ */ new Map();
4983
- for (const s of scores) {
4984
- const slot = pairs.get(s.itemId) ?? {};
4985
- if (s.positionOfAInput === "first") slot.first = s.score;
4986
- else if (s.positionOfAInput === "second") slot.second = s.score;
4987
- pairs.set(s.itemId, slot);
4988
- }
4989
- const deltas = [];
4990
- for (const { first, second } of pairs.values()) {
4991
- if (first !== void 0 && second !== void 0) deltas.push(first - second);
4992
- }
4993
- if (deltas.length === 0) return { avgDelta: 0, n: 0 };
4994
- return { avgDelta: deltas.reduce((a, b) => a + b, 0) / deltas.length, n: deltas.length };
4995
- }
4996
- function verbosityBias(samples) {
4997
- const n = samples.length;
4998
- if (n < 3) return { pearson: NaN, n };
4999
- return {
5000
- pearson: pearsonR(
5001
- samples.map((s) => s.outputLen),
5002
- samples.map((s) => s.score)
5003
- ),
5004
- n
5005
- };
5006
- }
5007
- function selfPreference(samples) {
5008
- const inF = samples.filter((s) => s.inFamily).map((s) => s.score);
5009
- const outF = samples.filter((s) => !s.inFamily).map((s) => s.score);
5010
- if (inF.length === 0 || outF.length === 0)
5011
- return { inFamilyMean: 0, outOfFamilyMean: 0, deltaMean: 0, n: 0 };
5012
- const inMean = inF.reduce((a, b) => a + b, 0) / inF.length;
5013
- const outMean = outF.reduce((a, b) => a + b, 0) / outF.length;
5014
- return {
5015
- inFamilyMean: inMean,
5016
- outOfFamilyMean: outMean,
5017
- deltaMean: inMean - outMean,
5018
- n: samples.length
5019
- };
5020
- }
5021
- function pearsonR(a, b) {
5022
- if (a.length !== b.length || a.length < 2) return NaN;
5023
- const mA = a.reduce((s, v) => s + v, 0) / a.length;
5024
- const mB = b.reduce((s, v) => s + v, 0) / b.length;
5025
- let num = 0, dA = 0, dB = 0;
5026
- for (let i = 0; i < a.length; i++) {
5027
- const da = a[i] - mA;
5028
- const db = b[i] - mB;
5029
- num += da * db;
5030
- dA += da * da;
5031
- dB += db * db;
5032
- }
5033
- if (dA === 0 || dB === 0) return dA === 0 && dB === 0 ? 1 : 0;
5034
- return num / Math.sqrt(dA * dB);
5035
- }
5036
- function weightedKappa(a, b) {
5037
- if (a.length !== b.length || a.length === 0) return NaN;
5038
- const min = Math.min(...a, ...b);
5039
- const max = Math.max(...a, ...b);
5040
- const K = max - min + 1;
5041
- if (K < 2) return 1;
5042
- const observed = Array.from({ length: K }, () => new Array(K).fill(0));
5043
- const rowMarg = new Array(K).fill(0);
5044
- const colMarg = new Array(K).fill(0);
5045
- for (let i = 0; i < a.length; i++) {
5046
- const ai = a[i] - min;
5047
- const bi = b[i] - min;
5048
- const row = observed[ai];
5049
- row[bi] = (row[bi] ?? 0) + 1;
5050
- rowMarg[ai]++;
5051
- colMarg[bi]++;
5052
- }
5053
- let num = 0;
5054
- let den = 0;
5055
- for (let i = 0; i < K; i++) {
5056
- for (let j = 0; j < K; j++) {
5057
- const w = (i - j) ** 2 / (K - 1) ** 2;
5058
- const expected = rowMarg[i] * colMarg[j] / a.length;
5059
- num += w * observed[i][j];
5060
- den += w * expected;
5061
- }
5062
- }
5063
- if (den === 0) return 1;
5064
- return 1 - num / den;
5065
- }
5066
-
5067
4967
  // src/observability.ts
5068
4968
  async function toLangfuseEnvelope(store, runId) {
5069
4969
  const run = await store.getRun(runId);
@@ -5564,7 +5464,7 @@ async function commitBisect(options) {
5564
5464
  }
5565
5465
  async function promptBisect(options) {
5566
5466
  const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
5567
- const join3 = (paragraphs) => paragraphs.join("\n\n");
5467
+ const join4 = (paragraphs) => paragraphs.join("\n\n");
5568
5468
  const goodParas = split(options.good);
5569
5469
  const badParas = split(options.bad);
5570
5470
  if (goodParas.length !== badParas.length) {
@@ -5584,7 +5484,7 @@ async function promptBisect(options) {
5584
5484
  const result = await bisect({
5585
5485
  good: goodMask,
5586
5486
  bad: badMask,
5587
- runEval: (mask) => options.runEval(join3(paragraphsFor(mask))),
5487
+ runEval: (mask) => options.runEval(join4(paragraphsFor(mask))),
5588
5488
  maxIterations: options.maxIterations ?? n + 5,
5589
5489
  halfway: (g, b) => {
5590
5490
  for (let i = 0; i < g.length; i++) {
@@ -5615,12 +5515,12 @@ async function promptBisect(options) {
5615
5515
  }
5616
5516
  }
5617
5517
  const materializedPath = result.path.map((s) => ({
5618
- state: join3(paragraphsFor(s.state)),
5518
+ state: join4(paragraphsFor(s.state)),
5619
5519
  score: s.score,
5620
5520
  pass: s.pass
5621
5521
  }));
5622
5522
  return {
5623
- culprit: join3(paragraphsFor(culprit)),
5523
+ culprit: join4(paragraphsFor(culprit)),
5624
5524
  path: materializedPath,
5625
5525
  converged: result.converged,
5626
5526
  inputInconsistent: result.inputInconsistent,
@@ -8308,6 +8208,52 @@ function createCompositeMutator(opts) {
8308
8208
  };
8309
8209
  }
8310
8210
 
8211
+ // src/discover-personas.ts
8212
+ import { promises as fs } from "fs";
8213
+ import { basename, extname, join as join3 } from "path";
8214
+ var DEFAULT_PATTERN = /^\d{2}-.+\.(yaml|yml|json|md)$/;
8215
+ async function discoverPersonas(dir, opts = {}) {
8216
+ const pattern = opts.pattern ?? DEFAULT_PATTERN;
8217
+ const exclude = new Set(opts.exclude ?? []);
8218
+ const include = opts.include;
8219
+ async function walk(d) {
8220
+ let entries;
8221
+ try {
8222
+ const raw = await fs.readdir(d, { withFileTypes: true });
8223
+ entries = raw.map((e) => ({ name: e.name, isDir: e.isDirectory() }));
8224
+ } catch (err) {
8225
+ const code = err.code;
8226
+ if (code === "ENOENT") return [];
8227
+ throw err;
8228
+ }
8229
+ const out = [];
8230
+ for (const entry of entries) {
8231
+ const full = join3(d, entry.name);
8232
+ if (entry.isDir) {
8233
+ if (opts.recursive) out.push(...await walk(full));
8234
+ continue;
8235
+ }
8236
+ if (!pattern.test(entry.name)) continue;
8237
+ if (exclude.has(entry.name) || exclude.has(basename(entry.name, extname(entry.name))))
8238
+ continue;
8239
+ if (include && include.length > 0) {
8240
+ const id = basename(entry.name, extname(entry.name));
8241
+ const matched = include.some((needle) => entry.name.includes(needle) || id.includes(needle));
8242
+ if (!matched) continue;
8243
+ }
8244
+ out.push({
8245
+ path: full,
8246
+ filename: entry.name,
8247
+ id: basename(entry.name, extname(entry.name))
8248
+ });
8249
+ }
8250
+ return out;
8251
+ }
8252
+ const results = await walk(dir);
8253
+ results.sort((a, b) => a.filename.localeCompare(b.filename));
8254
+ return results;
8255
+ }
8256
+
8311
8257
  // src/evolution-telemetry.ts
8312
8258
  import { appendFileSync as appendFileSync3, existsSync as existsSync5, mkdirSync as mkdirSync3, readFileSync as readFileSync4, writeFileSync } from "fs";
8313
8259
  import { dirname as dirname3 } from "path";
@@ -8697,6 +8643,90 @@ var JsonlTrialCache = class {
8697
8643
  }
8698
8644
  };
8699
8645
 
8646
+ // src/judge-retry.ts
8647
+ var DEFAULT_MAX_ATTEMPTS = 3;
8648
+ var DEFAULT_TIMEOUT_MS = 9e4;
8649
+ var DEFAULT_BACKOFF = (attempt) => Math.min(500 * 2 ** attempt, 16e3);
8650
+ var ABORT_PATTERNS = [
8651
+ /AbortError/i,
8652
+ /TimeoutError/i,
8653
+ /fetch failed/i,
8654
+ /ECONNRESET/i,
8655
+ /ETIMEDOUT/i,
8656
+ /EAI_AGAIN/i,
8657
+ /this operation was aborted/i,
8658
+ /stream.*ended.*unexpectedly/i,
8659
+ /socket hang up/i
8660
+ ];
8661
+ var RETRYABLE_HTTP_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);
8662
+ function defaultIsRetryable(err) {
8663
+ if (err instanceof Error) {
8664
+ if (ABORT_PATTERNS.some((p) => p.test(err.message) || p.test(err.name))) return true;
8665
+ const status = err.status;
8666
+ if (typeof status === "number" && RETRYABLE_HTTP_STATUS.has(status)) return true;
8667
+ }
8668
+ return false;
8669
+ }
8670
+ function sleep(ms) {
8671
+ return new Promise((resolve) => setTimeout(resolve, ms));
8672
+ }
8673
+ async function withJudgeRetry(judgeFn, policy = {}) {
8674
+ const maxAttempts = policy.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
8675
+ const timeoutMs = policy.timeoutMs ?? DEFAULT_TIMEOUT_MS;
8676
+ const backoff = policy.backoffMs ?? DEFAULT_BACKOFF;
8677
+ const isRetryable = policy.isRetryable ?? defaultIsRetryable;
8678
+ const models = policy.models && policy.models.length > 0 ? policy.models : [void 0];
8679
+ let totalAttempts = 0;
8680
+ const attemptErrors = [];
8681
+ let lastError;
8682
+ for (const model of models) {
8683
+ for (let attempt = 0; attempt < maxAttempts; attempt++) {
8684
+ totalAttempts += 1;
8685
+ const controller = new AbortController();
8686
+ const timer = setTimeout(() => controller.abort(new Error("TimeoutError")), timeoutMs);
8687
+ try {
8688
+ const value = await judgeFn(model, controller.signal);
8689
+ clearTimeout(timer);
8690
+ return {
8691
+ value,
8692
+ succeeded: true,
8693
+ attempts: totalAttempts,
8694
+ modelUsed: model,
8695
+ attemptErrors
8696
+ };
8697
+ } catch (err) {
8698
+ clearTimeout(timer);
8699
+ const errObj = err instanceof Error ? err : new Error(String(err));
8700
+ lastError = errObj;
8701
+ attemptErrors.push({
8702
+ attempt: totalAttempts,
8703
+ model: model ?? "(default)",
8704
+ error: errObj.message
8705
+ });
8706
+ if (!isRetryable(errObj)) {
8707
+ return {
8708
+ value: null,
8709
+ succeeded: false,
8710
+ attempts: totalAttempts,
8711
+ error: errObj,
8712
+ attemptErrors
8713
+ };
8714
+ }
8715
+ if (attempt < maxAttempts - 1) {
8716
+ await sleep(backoff(attempt));
8717
+ }
8718
+ }
8719
+ }
8720
+ }
8721
+ return {
8722
+ value: null,
8723
+ succeeded: false,
8724
+ attempts: totalAttempts,
8725
+ error: lastError,
8726
+ attemptErrors
8727
+ };
8728
+ }
8729
+
8700
8730
  // src/orthogonality.ts
8701
8731
  function passOrthogonality(input) {
8702
8732
  const passes = input.passes;
@@ -8914,6 +8944,55 @@ function createSandboxPool(opts) {
8914
8944
  utilization
8915
8945
  };
8916
8946
  }
8947
+
8948
+ // src/trial-aggregator.ts
8949
+ function meanOf(xs) {
8950
+ if (xs.length === 0) return 0;
8951
+ return xs.reduce((a, b) => a + b, 0) / xs.length;
8952
+ }
8953
+ function meanMetrics(rows) {
8954
+ if (rows.length === 0) return {};
8955
+ const keys = /* @__PURE__ */ new Set();
8956
+ for (const row of rows) for (const k of Object.keys(row)) keys.add(k);
8957
+ const out = {};
8958
+ for (const k of keys) {
8959
+ const xs = rows.map((r) => r[k]).filter((x) => typeof x === "number");
8960
+ if (xs.length > 0) out[k] = meanOf(xs);
8961
+ }
8962
+ return out;
8963
+ }
8964
+ function aggregateTrialsByMode(trials, opts) {
8965
+ const gradedTrials = trials.filter((t) => !t.error);
8966
+ const judgeOk = gradedTrials.filter((t) => t.judgeSucceeded !== false);
8967
+ const judgeFailed = gradedTrials.filter((t) => t.judgeSucceeded === false);
8968
+ if (opts.mode === "strict-fail" && judgeFailed.length > 0) {
8969
+ return {
8970
+ meanScore: 0,
8971
+ meanCost: 0,
8972
+ meanDurationMs: 0,
8973
+ okRate: 0,
8974
+ countedTrials: 0,
8975
+ excludedFailedTrials: judgeFailed.length,
8976
+ totalTrials: trials.length,
8977
+ metrics: {},
8978
+ strictFailure: {
8979
+ failedCount: judgeFailed.length,
8980
+ firstError: judgeFailed.find((t) => t.judgeError)?.judgeError
8981
+ }
8982
+ };
8983
+ }
8984
+ const counted = opts.mode === "exclude-failed" ? judgeOk : gradedTrials;
8985
+ return {
8986
+ meanScore: meanOf(counted.map((t) => t.score)),
8987
+ meanCost: meanOf(counted.map((t) => t.cost ?? 0)),
8988
+ meanDurationMs: meanOf(counted.map((t) => t.durationMs ?? 0)),
8989
+ okRate: gradedTrials.length === 0 ? 0 : gradedTrials.filter((t) => t.ok).length / gradedTrials.length,
8990
+ countedTrials: counted.length,
8991
+ excludedFailedTrials: judgeFailed.length,
8992
+ totalTrials: trials.length,
8993
+ metrics: meanMetrics(counted.map((t) => t.metrics ?? {}))
8994
+ };
8995
+ }
8917
8996
  export {
8918
8997
  AgentDriver,
8919
8998
  AgentEvalError,
@@ -9003,6 +9082,7 @@ export {
9003
9082
  adversarialJudge,
9004
9083
  aggregateLlm,
9005
9084
  aggregateRunScore,
9085
+ aggregateTrialsByMode,
9006
9086
  allCriticalPassed,
9007
9087
  analyzeAntiSlop,
9008
9088
  analyzeSeries,
@@ -9025,6 +9105,7 @@ export {
9025
9105
  buildTrajectory,
9026
9106
  byteLengthRange,
9027
9107
  calibrateJudge,
9108
+ calibrateJudgeContinuous,
9028
9109
  callLlm,
9029
9110
  callLlmJson,
9030
9111
  canaryLeakView,
@@ -9049,9 +9130,12 @@ export {
9049
9130
  computeToolUseMetrics,
9050
9131
  confidenceInterval,
9051
9132
  containsAll,
9133
+ continuousAgreement,
9052
9134
  controlFailureClassFromVerification,
9053
9135
  controlRunToFeedbackTrajectory,
9054
9136
  controlRunToRunRecord,
9137
+ corpusInterRaterAgreement,
9138
+ corpusInterRaterAgreementFromJudgeScores,
9055
9139
  createAntiSlopJudge,
9056
9140
  createCompositeMutator,
9057
9141
  createCustomJudge,
@@ -9073,6 +9157,7 @@ export {
9073
9157
  defaultProviderRedactor,
9074
9158
  defaultReferenceReplayMatcher,
9075
9159
  deployGateLayer,
9160
+ discoverPersonas,
9076
9161
  distillPlaybook,
9077
9162
  dominates,
9078
9163
  estimateCost,
@@ -9275,6 +9360,7 @@ export {
9275
9360
  whitespaceCollapseMutator,
9276
9361
  wilcoxonSignedRank,
9277
9362
  withAssignedFeedbackSplit,
9363
+ withJudgeRetry,
9278
9364
  wranglerDeployRunner
9279
9365
  };
9280
9366
  //# sourceMappingURL=index.js.map