@tangle-network/agent-eval 0.25.0 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +73 -0
- package/README.md +1 -0
- package/dist/{chunk-EDUKQ5AM.js → chunk-WHZMVFUV.js} +1 -1
- package/dist/chunk-WHZMVFUV.js.map +1 -0
- package/dist/governance/index.d.ts +1 -1
- package/dist/{index-Oj9fAPPN.d.ts → index-D3iBCjdF.d.ts} +63 -2
- package/dist/index.d.ts +225 -8
- package/dist/index.js +414 -18
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +2 -2
- package/dist/optimization.js +1 -1
- package/dist/{release-report-BNgMdqPF.d.ts → release-report-wfUySN5F.d.ts} +1 -1
- package/dist/reporting.d.ts +2 -2
- package/dist/{researcher-BPT8x_NT.d.ts → researcher-bGkI7vCl.d.ts} +1 -1
- package/dist/rl.d.ts +3 -3
- package/dist/{summary-report-C7VPYEj2.d.ts → summary-report-DZVXOCK_.d.ts} +12 -0
- package/docs/concepts.md +11 -0
- package/package.json +1 -1
- package/dist/chunk-EDUKQ5AM.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -96,7 +96,7 @@ import {
|
|
|
96
96
|
summarizePreferenceMemory,
|
|
97
97
|
trialTraceFromMultiShotTrial,
|
|
98
98
|
withAssignedFeedbackSplit
|
|
99
|
-
} from "./chunk-
|
|
99
|
+
} from "./chunk-WHZMVFUV.js";
|
|
100
100
|
import {
|
|
101
101
|
RunRecordValidationError,
|
|
102
102
|
isRunRecord,
|
|
@@ -425,12 +425,12 @@ function ghCliClient(opts = {}) {
|
|
|
425
425
|
await exec("git", ["branch", "-D", input.branchName], { cwd });
|
|
426
426
|
await run("git", ["checkout", "-b", input.branchName]);
|
|
427
427
|
const { mkdir, writeFile } = await import("fs/promises");
|
|
428
|
-
const { dirname: dirname5, join:
|
|
428
|
+
const { dirname: dirname5, join: join4, resolve } = await import("path");
|
|
429
429
|
for (const change of input.fileChanges) {
|
|
430
430
|
const abs = resolve(cwd, change.path);
|
|
431
431
|
await mkdir(dirname5(abs), { recursive: true });
|
|
432
432
|
await writeFile(abs, change.contents, "utf8");
|
|
433
|
-
await run("git", ["add",
|
|
433
|
+
await run("git", ["add", join4(change.path)]);
|
|
434
434
|
}
|
|
435
435
|
const env = {};
|
|
436
436
|
if (input.authorName) env.GIT_AUTHOR_NAME = input.authorName;
|
|
@@ -3073,36 +3073,36 @@ var FileSystemExperimentStore = class {
|
|
|
3073
3073
|
return idx.listRuns(experimentId);
|
|
3074
3074
|
}
|
|
3075
3075
|
async ensureDir() {
|
|
3076
|
-
const
|
|
3077
|
-
await
|
|
3076
|
+
const fs2 = await import("fs/promises");
|
|
3077
|
+
await fs2.mkdir(this.dir, { recursive: true });
|
|
3078
3078
|
}
|
|
3079
3079
|
async append(name, record) {
|
|
3080
3080
|
await this.ensureDir();
|
|
3081
|
-
const
|
|
3081
|
+
const fs2 = await import("fs/promises");
|
|
3082
3082
|
const path = await import("path");
|
|
3083
3083
|
const active = path.join(this.dir, `${name}.ndjson`);
|
|
3084
3084
|
try {
|
|
3085
|
-
const stat = await
|
|
3085
|
+
const stat = await fs2.stat(active);
|
|
3086
3086
|
if (stat.size >= this.maxBytes) {
|
|
3087
3087
|
const rolled = path.join(this.dir, `${name}.${Date.now()}.ndjson`);
|
|
3088
|
-
await
|
|
3088
|
+
await fs2.rename(active, rolled);
|
|
3089
3089
|
}
|
|
3090
3090
|
} catch {
|
|
3091
3091
|
}
|
|
3092
|
-
await
|
|
3092
|
+
await fs2.appendFile(active, `${JSON.stringify(record)}
|
|
3093
3093
|
`, "utf8");
|
|
3094
3094
|
}
|
|
3095
3095
|
async load() {
|
|
3096
3096
|
if (this.loaded && this.index) return this.index;
|
|
3097
|
-
const
|
|
3097
|
+
const fs2 = await import("fs/promises");
|
|
3098
3098
|
const path = await import("path");
|
|
3099
3099
|
const store = new InMemoryExperimentStore();
|
|
3100
3100
|
try {
|
|
3101
|
-
const entries = await
|
|
3101
|
+
const entries = await fs2.readdir(this.dir);
|
|
3102
3102
|
const sorted = entries.filter((f) => f.endsWith(".ndjson")).sort((a, b) => a.localeCompare(b));
|
|
3103
3103
|
for (const file of sorted) {
|
|
3104
3104
|
const full = path.join(this.dir, file);
|
|
3105
|
-
const content = await
|
|
3105
|
+
const content = await fs2.readFile(full, "utf8");
|
|
3106
3106
|
const base = file.split(".")[0];
|
|
3107
3107
|
for (const line of content.split("\n")) {
|
|
3108
3108
|
if (!line.trim()) continue;
|
|
@@ -5063,6 +5063,218 @@ function weightedKappa(a, b) {
|
|
|
5063
5063
|
if (den === 0) return 1;
|
|
5064
5064
|
return 1 - num / den;
|
|
5065
5065
|
}
|
|
5066
|
+
/**
 * Summarise agreement between raters on continuous scores.
 *
 * `scores` is a list of rows, one row per rated item and one column per rater.
 * Rows with fewer than two entries, rows containing a non-finite value, and
 * rows whose length differs from the first surviving row are dropped.
 *
 * @param {number[][]} scores - Item-by-rater score matrix.
 * @param {object} [opts]
 * @param {number} [opts.bootstrap=1000] - Number of bootstrap resamples; `<= 0` disables the intervals.
 * @param {string} [opts.weights="quadratic"] - Weighting scheme forwarded to `continuousWeightedKappa`.
 * @param {number} [opts.seed=12648430] - Seed for the `mulberry32` generator used when resampling.
 * @param {number} [opts.ciLevel=0.95] - Level passed to `percentileBounds`.
 * @returns {{weightedKappa: number, icc: number, pearson: number, spearman: number,
 *   ci: {icc: number[], weightedKappa: number[]}, n: number, raters: number}}
 *   Every statistic is NaN when fewer than two usable rows (or raters) remain.
 */
function continuousAgreement(scores, opts = {}) {
  const resamples = opts.bootstrap ?? 1e3;
  const scheme = opts.weights ?? "quadratic";
  const rngSeed = opts.seed ?? 12648430;
  const level = opts.ciLevel ?? 0.95;

  const usable = scores.filter((row) => row.length >= 2 && row.every((v) => Number.isFinite(v)));
  const raters = usable[0]?.length ?? 0;
  const clean = usable.filter((row) => row.length === raters);
  const nClean = clean.length;

  if (nClean < 2 || raters < 2) {
    return {
      weightedKappa: NaN,
      icc: NaN,
      pearson: NaN,
      spearman: NaN,
      ci: { icc: [NaN, NaN], weightedKappa: [NaN, NaN] },
      n: nClean,
      raters
    };
  }

  // Point estimates on the full cleaned matrix.
  const weightedKappa = continuousWeightedKappa(clean, scheme);
  const icc = icc21(clean);
  const pearson = avgPairwise(clean, pearsonR);
  const spearman = avgPairwise(clean, spearmanR);

  let iccInterval = [NaN, NaN];
  let kappaInterval = [NaN, NaN];

  if (resamples > 0) {
    const rng = mulberry32(rngSeed);
    const iccDraws = [];
    const kappaDraws = [];

    for (let round = 0; round < resamples; round++) {
      // Resample whole rows with replacement, keeping the sample size fixed.
      const sample = Array.from({ length: nClean }, () => clean[Math.floor(rng() * nClean)]);
      const iccDraw = icc21(sample);
      const kappaDraw = continuousWeightedKappa(sample, scheme);
      if (Number.isFinite(iccDraw)) iccDraws.push(iccDraw);
      if (Number.isFinite(kappaDraw)) kappaDraws.push(kappaDraw);
    }

    const [lower, upper] = percentileBounds(level);
    // Percentile interval over the finite draws; stays [NaN, NaN] when there are none.
    const intervalOf = (draws) => {
      if (draws.length === 0) return [NaN, NaN];
      draws.sort((x, y) => x - y);
      return [quantile(draws, lower), quantile(draws, upper)];
    };
    iccInterval = intervalOf(iccDraws);
    kappaInterval = intervalOf(kappaDraws);
  }

  return {
    weightedKappa,
    icc,
    pearson,
    spearman,
    ci: { icc: iccInterval, weightedKappa: kappaInterval },
    n: nClean,
    raters
  };
}
|
|
5128
|
+
/**
 * Run `calibrateJudge` and extend its result with continuous-agreement
 * statistics computed over the items present in both inputs.
 *
 * Golden entries are keyed by `itemId`; a candidate entry only counts when its
 * `itemId` matches a golden entry and its `score` is finite.
 *
 * @param {Iterable<{itemId: *, humanScore: number}>} golden - Human reference scores.
 * @param {Iterable<{itemId: *, score: number}>} candidate - Judge scores to calibrate.
 * @param {object} [opts] - Forwarded unchanged to `continuousAgreement`.
 * @returns {object} The `calibrateJudge` result plus `weightedKappaContinuous`,
 *   `icc`, `spearman` and `ci`.
 */
function calibrateJudgeContinuous(golden, candidate, opts = {}) {
  const base = calibrateJudge(golden, candidate);

  const byItem = /* @__PURE__ */ new Map();
  for (const g of golden) {
    byItem.set(g.itemId, { human: g.humanScore, judge: NaN });
  }
  for (const c of candidate) {
    const slot = byItem.get(c.itemId);
    if (slot) slot.judge = c.score;
  }

  // Keep only items the judge actually scored: [human, judge] pairs.
  const rows = [...byItem.values()]
    .filter((slot) => Number.isFinite(slot.judge))
    .map((slot) => [slot.human, slot.judge]);

  const { weightedKappa, icc, spearman, ci } = continuousAgreement(rows, opts);
  return {
    ...base,
    weightedKappaContinuous: weightedKappa,
    icc,
    spearman,
    ci
  };
}
|
|
5149
|
+
/**
 * Mean pairwise weighted kappa for continuous scores.
 *
 * For every pair of rater columns the observed mean disagreement is compared
 * with the mean disagreement over all cross pairings of the two columns, and
 * the pair scores `1 - observed / expected`. The per-pair values are averaged.
 *
 * @param {number[][]} rows - Item-by-rater matrix; the rater count is taken from the first row.
 * @param {string} scheme - `"linear"` uses |x - y|; anything else uses (x - y)^2.
 * @returns {number} Averaged kappa, or NaN when there are no rows or fewer than two raters.
 */
function continuousWeightedKappa(rows, scheme) {
  if (rows.length === 0) return NaN;
  const raterCount = rows[0].length;
  if (raterCount < 2) return NaN;

  const disagreement = scheme === "linear"
    ? (x, y) => Math.abs(x - y)
    : (x, y) => (x - y) ** 2;

  const kappaForPair = (first, second) => {
    const count = first.length;

    let observed = 0;
    for (let idx = 0; idx < count; idx++) observed += disagreement(first[idx], second[idx]);
    observed /= count;

    let expected = 0;
    for (let p = 0; p < count; p++) {
      for (let q = 0; q < count; q++) expected += disagreement(first[p], second[q]);
    }
    expected /= count * count;

    // No spread at all: perfect agreement only if nothing was observed either.
    if (expected === 0) return observed === 0 ? 1 : 0;
    return 1 - observed / expected;
  };

  let total = 0;
  let pairCount = 0;
  for (let left = 0; left < raterCount; left++) {
    for (let right = left + 1; right < raterCount; right++) {
      const leftColumn = rows.map((row) => row[left]);
      const rightColumn = rows.map((row) => row[right]);
      total += kappaForPair(leftColumn, rightColumn);
      pairCount++;
    }
  }
  return pairCount === 0 ? NaN : total / pairCount;
}
|
|
5179
|
+
/**
 * Intraclass correlation computed from a two-way sum-of-squares decomposition:
 * (MSR - MSE) / (MSR + (k - 1) * MSE + k * (MSC - MSE) / n),
 * with n rows (items) and k columns (raters, taken from the first row).
 *
 * @param {number[][]} rows - Item-by-rater matrix.
 * @returns {number} The coefficient; NaN when n < 2 or k < 2. When the
 *   denominator is exactly zero the result is 1 if both MSR and MSE are zero,
 *   otherwise 0.
 */
function icc21(rows) {
  const itemCount = rows.length;
  if (itemCount < 2) return NaN;
  const raterCount = rows[0].length;
  if (raterCount < 2) return NaN;

  const add = (acc, value) => acc + value;
  const rowMeans = rows.map((row) => row.reduce(add, 0) / raterCount);
  const colMeans = Array.from({ length: raterCount }, (_, col) => {
    let columnTotal = 0;
    for (let item = 0; item < itemCount; item++) columnTotal += rows[item][col];
    return columnTotal / itemCount;
  });

  let grandMean = 0;
  for (const mean of rowMeans) grandMean += mean;
  grandMean /= itemCount;

  // Between-item, between-rater and total sums of squares.
  let ssRows = 0;
  for (const mean of rowMeans) ssRows += (mean - grandMean) ** 2;
  ssRows *= raterCount;

  let ssCols = 0;
  for (const mean of colMeans) ssCols += (mean - grandMean) ** 2;
  ssCols *= itemCount;

  let ssTotal = 0;
  for (let item = 0; item < itemCount; item++) {
    for (let col = 0; col < raterCount; col++) {
      ssTotal += (rows[item][col] - grandMean) ** 2;
    }
  }
  const ssError = ssTotal - ssRows - ssCols;

  const dfError = (itemCount - 1) * (raterCount - 1);
  const msRows = ssRows / (itemCount - 1);
  const msCols = ssCols / (raterCount - 1);
  const msError = dfError > 0 ? ssError / dfError : 0;

  const denominator = msRows + (raterCount - 1) * msError + raterCount * (msCols - msError) / itemCount;
  if (denominator === 0) {
    return msRows === 0 && msError === 0 ? 1 : 0;
  }
  return (msRows - msError) / denominator;
}
|
|
5217
|
+
/**
 * Apply `fn` to every unordered pair of columns and average the finite results.
 *
 * @param {number[][]} rows - Item-by-rater matrix; the column count comes from the first row.
 * @param {(a: number[], b: number[]) => number} fn - Statistic computed on two columns.
 * @returns {number} Mean of the finite pair values, or NaN when there are fewer
 *   than two columns or no pair produced a finite value.
 */
function avgPairwise(rows, fn) {
  const width = rows[0]?.length ?? 0;
  if (width < 2) return NaN;

  // A fresh array per call, so `fn` never sees an array shared between pairs.
  const column = (index) => rows.map((row) => row[index]);

  const finite = [];
  for (let left = 0; left < width; left++) {
    for (let right = left + 1; right < width; right++) {
      const value = fn(column(left), column(right));
      if (Number.isFinite(value)) finite.push(value);
    }
  }

  if (finite.length === 0) return NaN;
  return finite.reduce((acc, value) => acc + value, 0) / finite.length;
}
|
|
5235
|
+
/**
 * Rank correlation: `pearsonR` applied to the tie-averaged ranks of both inputs.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} NaN when the lengths differ or fewer than two values are given.
 */
function spearmanR(a, b) {
  const mismatched = a.length !== b.length;
  const tooShort = a.length < 2;
  if (mismatched || tooShort) return NaN;

  const ranksA = rankWithTies(a);
  const ranksB = rankWithTies(b);
  return pearsonR(ranksA, ranksB);
}
|
|
5239
|
+
/**
 * Assign 1-based ascending ranks, giving tied values the average of the rank
 * positions they occupy.
 *
 * @param {number[]} xs - Values to rank.
 * @returns {number[]} Ranks aligned with the positions of `xs`.
 */
function rankWithTies(xs) {
  const size = xs.length;
  // Sort positions by value; the input itself is left untouched.
  const order = xs.map((value, position) => ({ value, position }));
  order.sort((p, q) => p.value - q.value);

  const ranks = new Array(size).fill(0);
  let start = 0;
  while (start < size) {
    // [start, end) is one run of equal values.
    let end = start + 1;
    while (end < size && order[end].value === order[start].value) end++;

    const sharedRank = (start + (end - 1)) / 2 + 1;
    for (let slot = start; slot < end; slot++) {
      ranks[order[slot].position] = sharedRank;
    }
    start = end;
  }
  return ranks;
}
|
|
5254
|
+
/**
 * Build a deterministic pseudo-random generator from a 32-bit seed.
 * Not suitable for anything security-sensitive.
 *
 * @param {number} seed - Coerced to an unsigned 32-bit integer.
 * @returns {() => number} Function yielding values in [0, 1); the same seed
 *   always produces the same sequence.
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return function next() {
    // Advance the 32-bit state by a fixed increment, wrapping on overflow.
    state = (state + 0x6d2b79f5) >>> 0;
    let mixed = Math.imul(state ^ (state >>> 15), state | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    // Scale the unsigned 32-bit result into [0, 1).
    return ((mixed ^ (mixed >>> 14)) >>> 0) / 4294967296;
  };
}
|
|
5264
|
+
/**
 * Convert a two-sided confidence level into its lower and upper quantile
 * positions, e.g. 0.5 -> [0.25, 0.75].
 *
 * @param {number} ciLevel - Confidence level; not validated here.
 * @returns {[number, number]} `[lower, upper]` quantile positions.
 */
function percentileBounds(ciLevel) {
  const lower = (1 - ciLevel) / 2;
  const upper = 1 - lower;
  return [lower, upper];
}
|
|
5268
|
+
/**
 * Linearly interpolated quantile of an array that is already sorted ascending.
 *
 * @param {number[]} sorted - Ascending values; not re-sorted or validated here.
 * @param {number} q - Quantile position; 0 maps to the first element, 1 to the last.
 * @returns {number} NaN for an empty array, the sole element for a
 *   single-element array, otherwise the interpolated value.
 */
function quantile(sorted, q) {
  const size = sorted.length;
  if (size === 0) return NaN;
  if (size === 1) return sorted[0];

  const position = q * (size - 1);
  const below = Math.floor(position);
  const above = Math.ceil(position);
  // Position lands exactly on an element: nothing to interpolate.
  if (below === above) return sorted[below];

  const weight = position - below;
  return sorted[below] * (1 - weight) + sorted[above] * weight;
}
|
|
5066
5278
|
|
|
5067
5279
|
// src/observability.ts
|
|
5068
5280
|
async function toLangfuseEnvelope(store, runId) {
|
|
@@ -5564,7 +5776,7 @@ async function commitBisect(options) {
|
|
|
5564
5776
|
}
|
|
5565
5777
|
async function promptBisect(options) {
|
|
5566
5778
|
const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
|
|
5567
|
-
const
|
|
5779
|
+
const join4 = (paragraphs) => paragraphs.join("\n\n");
|
|
5568
5780
|
const goodParas = split(options.good);
|
|
5569
5781
|
const badParas = split(options.bad);
|
|
5570
5782
|
if (goodParas.length !== badParas.length) {
|
|
@@ -5584,7 +5796,7 @@ async function promptBisect(options) {
|
|
|
5584
5796
|
const result = await bisect({
|
|
5585
5797
|
good: goodMask,
|
|
5586
5798
|
bad: badMask,
|
|
5587
|
-
runEval: (mask) => options.runEval(
|
|
5799
|
+
runEval: (mask) => options.runEval(join4(paragraphsFor(mask))),
|
|
5588
5800
|
maxIterations: options.maxIterations ?? n + 5,
|
|
5589
5801
|
halfway: (g, b) => {
|
|
5590
5802
|
for (let i = 0; i < g.length; i++) {
|
|
@@ -5615,12 +5827,12 @@ async function promptBisect(options) {
|
|
|
5615
5827
|
}
|
|
5616
5828
|
}
|
|
5617
5829
|
const materializedPath = result.path.map((s) => ({
|
|
5618
|
-
state:
|
|
5830
|
+
state: join4(paragraphsFor(s.state)),
|
|
5619
5831
|
score: s.score,
|
|
5620
5832
|
pass: s.pass
|
|
5621
5833
|
}));
|
|
5622
5834
|
return {
|
|
5623
|
-
culprit:
|
|
5835
|
+
culprit: join4(paragraphsFor(culprit)),
|
|
5624
5836
|
path: materializedPath,
|
|
5625
5837
|
converged: result.converged,
|
|
5626
5838
|
inputInconsistent: result.inputInconsistent,
|
|
@@ -5865,7 +6077,7 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
5865
6077
|
runCountByScenario.set(r.scenarioId, (runCountByScenario.get(r.scenarioId) ?? 0) + 1);
|
|
5866
6078
|
}
|
|
5867
6079
|
const runCounts = [...runCountByScenario.values()];
|
|
5868
|
-
const p25 = runCounts.length > 0 ?
|
|
6080
|
+
const p25 = runCounts.length > 0 ? quantile2(runCounts, 0.25) : 0;
|
|
5869
6081
|
for (const s of scenarios) {
|
|
5870
6082
|
const count = runCountByScenario.get(s.id) ?? 0;
|
|
5871
6083
|
if (count <= p25 && count < 3) {
|
|
@@ -5919,7 +6131,7 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
5919
6131
|
}
|
|
5920
6132
|
return targets.sort((a, b) => b.priority - a.priority).slice(0, topK);
|
|
5921
6133
|
}
|
|
5922
|
-
function
|
|
6134
|
+
function quantile2(xs, p) {
|
|
5923
6135
|
const sorted = [...xs].sort((a, b) => a - b);
|
|
5924
6136
|
const idx = p * (sorted.length - 1);
|
|
5925
6137
|
const lo = Math.floor(idx);
|
|
@@ -8308,6 +8520,52 @@ function createCompositeMutator(opts) {
|
|
|
8308
8520
|
};
|
|
8309
8521
|
}
|
|
8310
8522
|
|
|
8523
|
+
// src/discover-personas.ts
|
|
8524
|
+
import { promises as fs } from "fs";
|
|
8525
|
+
import { basename, extname, join as join3 } from "path";
|
|
8526
|
+
/** Default filename filter: two digits, a dash, a name, then .yaml/.yml/.json/.md. */
var DEFAULT_PATTERN = /^\d{2}-.+\.(yaml|yml|json|md)$/;

/**
 * List persona files under `dir`.
 *
 * A file is returned when its name matches `opts.pattern`, neither its name nor
 * its extension-less id is in `opts.exclude`, and (when `opts.include` is
 * non-empty) its name or id contains at least one of the include substrings.
 *
 * @param {string} dir - Directory to scan. A missing directory yields `[]`.
 * @param {object} [opts]
 * @param {RegExp} [opts.pattern] - Filename filter; defaults to `DEFAULT_PATTERN`.
 * @param {string[]} [opts.exclude] - Filenames or ids to skip.
 * @param {string[]} [opts.include] - Substrings; when non-empty, at least one must match.
 * @param {boolean} [opts.recursive] - Descend into subdirectories when truthy.
 * @returns {Promise<Array<{path: string, filename: string, id: string}>>}
 *   Matches sorted by filename, then by path.
 * @throws Any `readdir` error other than `ENOENT` is rethrown.
 */
async function discoverPersonas(dir, opts = {}) {
  const pattern = opts.pattern ?? DEFAULT_PATTERN;
  const excluded = new Set(opts.exclude ?? []);
  const include = opts.include;

  // A caller-supplied /g or /y regex keeps `lastIndex` between `test` calls,
  // which would make every match poison the next filename. Start each test at 0.
  const matchesPattern = (filename) => {
    if (pattern.global || pattern.sticky) pattern.lastIndex = 0;
    return pattern.test(filename);
  };

  async function walk(current) {
    let dirents;
    try {
      dirents = await fs.readdir(current, { withFileTypes: true });
    } catch (err) {
      // Only a missing directory is treated as "nothing found".
      if (err?.code === "ENOENT") return [];
      throw err;
    }

    const found = [];
    for (const dirent of dirents) {
      const full = join3(current, dirent.name);
      if (dirent.isDirectory()) {
        if (opts.recursive) found.push(...(await walk(full)));
        continue;
      }
      if (!matchesPattern(dirent.name)) continue;

      const id = basename(dirent.name, extname(dirent.name));
      if (excluded.has(dirent.name) || excluded.has(id)) continue;

      if (include && include.length > 0) {
        const wanted = include.some((needle) => dirent.name.includes(needle) || id.includes(needle));
        if (!wanted) continue;
      }
      found.push({ path: full, filename: dirent.name, id });
    }
    return found;
  }

  const results = await walk(dir);
  // Tie-break on path so equal filenames from different directories are ordered
  // deterministically rather than by directory-listing order.
  results.sort((a, b) => a.filename.localeCompare(b.filename) || a.path.localeCompare(b.path));
  return results;
}
|
|
8568
|
+
|
|
8311
8569
|
// src/evolution-telemetry.ts
|
|
8312
8570
|
import { appendFileSync as appendFileSync3, existsSync as existsSync5, mkdirSync as mkdirSync3, readFileSync as readFileSync4, writeFileSync } from "fs";
|
|
8313
8571
|
import { dirname as dirname3 } from "path";
|
|
@@ -8697,6 +8955,90 @@ var JsonlTrialCache = class {
|
|
|
8697
8955
|
}
|
|
8698
8956
|
};
|
|
8699
8957
|
|
|
8958
|
+
// src/judge-retry.ts
|
|
8959
|
+
// Fallback retry policy, used when the caller's policy omits a field.

/** Attempts made per model. */
var DEFAULT_MAX_ATTEMPTS = 3;

/** Per-attempt timeout in milliseconds. */
var DEFAULT_TIMEOUT_MS = 90000;

/**
 * Exponential backoff in milliseconds: 500 for attempt 0, doubling per
 * attempt, capped at 16000.
 */
var DEFAULT_BACKOFF = function defaultBackoff(attempt) {
  const uncapped = 500 * 2 ** attempt;
  return Math.min(uncapped, 16000);
};
|
|
8962
|
+
/** Case-insensitive patterns matched against an error's message and name. */
var ABORT_PATTERNS = [
  /AbortError/i,
  /TimeoutError/i,
  /fetch failed/i,
  /ECONNRESET/i,
  /ETIMEDOUT/i,
  /EAI_AGAIN/i,
  /this operation was aborted/i,
  /stream.*ended.*unexpectedly/i,
  /socket hang up/i
];

/** Numeric `status` values that mark an error as retryable. */
var RETRYABLE_HTTP_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);

/**
 * Default retry classifier.
 *
 * @param {*} err - The thrown value.
 * @returns {boolean} True only for `Error` instances whose message or name
 *   matches one of `ABORT_PATTERNS`, or whose numeric `status` is in
 *   `RETRYABLE_HTTP_STATUS`. Anything that is not an `Error` is not retryable.
 */
function defaultIsRetryable(err) {
  if (!(err instanceof Error)) return false;

  const looksTransient = ABORT_PATTERNS.some(
    (pattern) => pattern.test(err.message) || pattern.test(err.name)
  );
  if (looksTransient) return true;

  const { status } = err;
  return typeof status === "number" && RETRYABLE_HTTP_STATUS.has(status);
}
|
|
8982
|
+
/**
 * Promise-based delay.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
8985
|
+
/**
 * Call `judgeFn` with retries, a per-attempt abort signal, and optional model
 * fallback. Never throws for `judgeFn` failures; the outcome is reported in
 * the returned object.
 *
 * Each model in `policy.models` (or a single `undefined` model when none are
 * given) gets up to `maxAttempts` tries. A failure the classifier rejects ends
 * the whole call immediately. Between retries of the same model the function
 * waits `backoffMs(attempt)`; there is no wait after a model's final attempt.
 *
 * @param {(model: (string|undefined), signal: AbortSignal) => *} judgeFn -
 *   Judge call. The signal is aborted after `timeoutMs`; honouring it is up to `judgeFn`.
 * @param {object} [policy]
 * @param {number} [policy.maxAttempts] - Tries per model (default `DEFAULT_MAX_ATTEMPTS`).
 * @param {number} [policy.timeoutMs] - Delay before the signal aborts (default `DEFAULT_TIMEOUT_MS`).
 * @param {(attempt: number) => number} [policy.backoffMs] - Delay before the next try (default `DEFAULT_BACKOFF`).
 * @param {(err: Error) => boolean} [policy.isRetryable] - Retry classifier (default `defaultIsRetryable`).
 * @param {string[]} [policy.models] - Models to try in order.
 * @returns {Promise<object>} On success `{ value, succeeded: true, attempts, modelUsed, attemptErrors }`;
 *   on failure `{ value: null, succeeded: false, attempts, error, attemptErrors }`.
 */
async function withJudgeRetry(judgeFn, policy = {}) {
  const attemptLimit = policy.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
  const attemptTimeout = policy.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const delayFor = policy.backoffMs ?? DEFAULT_BACKOFF;
  const shouldRetry = policy.isRetryable ?? defaultIsRetryable;
  const candidates = policy.models && policy.models.length > 0 ? policy.models : [void 0];

  const attemptErrors = [];
  let attempts = 0;
  let lastError;

  const failure = (error) => ({
    value: null,
    succeeded: false,
    attempts,
    error,
    attemptErrors
  });

  for (const model of candidates) {
    for (let attempt = 0; attempt < attemptLimit; attempt++) {
      attempts += 1;
      const controller = new AbortController();
      const timer = setTimeout(() => controller.abort(new Error("TimeoutError")), attemptTimeout);

      let value;
      try {
        try {
          value = await judgeFn(model, controller.signal);
        } finally {
          // Stop the timeout as soon as the attempt settles, success or not.
          clearTimeout(timer);
        }
      } catch (err) {
        const failureReason = err instanceof Error ? err : new Error(String(err));
        lastError = failureReason;
        attemptErrors.push({
          attempt: attempts,
          model: model ?? "(default)",
          error: failureReason.message
        });

        if (!shouldRetry(failureReason)) return failure(failureReason);

        const hasMoreTries = attempt < attemptLimit - 1;
        if (hasMoreTries) await sleep(delayFor(attempt));
        continue;
      }

      return {
        value,
        succeeded: true,
        attempts,
        modelUsed: model,
        attemptErrors
      };
    }
  }

  // Every model exhausted its attempts.
  return failure(lastError);
}
|
|
9041
|
+
|
|
8700
9042
|
// src/orthogonality.ts
|
|
8701
9043
|
function passOrthogonality(input) {
|
|
8702
9044
|
const passes = input.passes;
|
|
@@ -8914,6 +9256,55 @@ function createSandboxPool(opts) {
|
|
|
8914
9256
|
utilization
|
|
8915
9257
|
};
|
|
8916
9258
|
}
|
|
9259
|
+
|
|
9260
|
+
// src/trial-aggregator.ts
|
|
9261
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} xs
 * @returns {number} The mean, or 0 for an empty list.
 */
function meanOf(xs) {
  const add = (total, value) => total + value;
  return xs.length === 0 ? 0 : xs.reduce(add, 0) / xs.length;
}
|
|
9265
|
+
/**
 * Average each metric key across a list of metric objects.
 *
 * For every key seen in any row, only values of type `number` contribute; a key
 * with no numeric value in any row is omitted from the result.
 *
 * @param {object[]} rows - Metric objects keyed by metric name.
 * @returns {Object<string, number>} Per-key mean; `{}` for an empty list.
 */
function meanMetrics(rows) {
  if (rows.length === 0) return {};

  // Union of keys, in first-seen order.
  const names = /* @__PURE__ */ new Set();
  for (const row of rows) {
    for (const name of Object.keys(row)) names.add(name);
  }

  const averages = {};
  for (const name of names) {
    const numeric = rows.map((row) => row[name]).filter((value) => typeof value === "number");
    if (numeric.length === 0) continue;
    averages[name] = numeric.reduce((total, value) => total + value, 0) / numeric.length;
  }
  return averages;
}
|
|
9276
|
+
/**
 * Aggregate trial results according to how judge failures should be treated.
 *
 * Trials with a truthy `error` are never graded. Among graded trials, those
 * with `judgeSucceeded === false` are judge failures.
 *
 * - `"strict-fail"`: if any judge failure exists, every aggregate is zeroed and
 *   a `strictFailure` summary is attached.
 * - `"exclude-failed"`: means are computed over graded trials whose judge did not fail.
 * - any other mode: means are computed over all graded trials.
 *
 * `okRate` is always the share of graded trials with a truthy `ok`.
 *
 * @param {object[]} trials - Trial records (`score`, `cost`, `durationMs`, `ok`,
 *   `metrics`, `error`, `judgeSucceeded`, `judgeError` are read).
 * @param {{mode: string}} opts - Aggregation mode.
 * @returns {object} Means, `okRate`, trial counts, per-key `metrics` and, in the
 *   strict-failure case, `strictFailure`.
 */
function aggregateTrialsByMode(trials, opts) {
  const graded = trials.filter((trial) => !trial.error);

  // Split graded trials by judge outcome in one pass.
  const judgedOk = [];
  const judgeFailures = [];
  for (const trial of graded) {
    if (trial.judgeSucceeded === false) judgeFailures.push(trial);
    else judgedOk.push(trial);
  }

  const failedCount = judgeFailures.length;
  const totalTrials = trials.length;

  if (opts.mode === "strict-fail" && failedCount > 0) {
    const firstWithMessage = judgeFailures.find((trial) => trial.judgeError);
    return {
      meanScore: 0,
      meanCost: 0,
      meanDurationMs: 0,
      okRate: 0,
      countedTrials: 0,
      excludedFailedTrials: failedCount,
      totalTrials,
      metrics: {},
      strictFailure: {
        failedCount,
        firstError: firstWithMessage?.judgeError
      }
    };
  }

  const counted = opts.mode === "exclude-failed" ? judgedOk : graded;
  const okCount = graded.filter((trial) => trial.ok).length;

  return {
    meanScore: meanOf(counted.map((trial) => trial.score)),
    meanCost: meanOf(counted.map((trial) => trial.cost ?? 0)),
    meanDurationMs: meanOf(counted.map((trial) => trial.durationMs ?? 0)),
    okRate: graded.length === 0 ? 0 : okCount / graded.length,
    countedTrials: counted.length,
    excludedFailedTrials: failedCount,
    totalTrials,
    metrics: meanMetrics(counted.map((trial) => trial.metrics ?? {}))
  };
}
|
|
8917
9308
|
export {
|
|
8918
9309
|
AgentDriver,
|
|
8919
9310
|
AgentEvalError,
|
|
@@ -9003,6 +9394,7 @@ export {
|
|
|
9003
9394
|
adversarialJudge,
|
|
9004
9395
|
aggregateLlm,
|
|
9005
9396
|
aggregateRunScore,
|
|
9397
|
+
aggregateTrialsByMode,
|
|
9006
9398
|
allCriticalPassed,
|
|
9007
9399
|
analyzeAntiSlop,
|
|
9008
9400
|
analyzeSeries,
|
|
@@ -9025,6 +9417,7 @@ export {
|
|
|
9025
9417
|
buildTrajectory,
|
|
9026
9418
|
byteLengthRange,
|
|
9027
9419
|
calibrateJudge,
|
|
9420
|
+
calibrateJudgeContinuous,
|
|
9028
9421
|
callLlm,
|
|
9029
9422
|
callLlmJson,
|
|
9030
9423
|
canaryLeakView,
|
|
@@ -9049,6 +9442,7 @@ export {
|
|
|
9049
9442
|
computeToolUseMetrics,
|
|
9050
9443
|
confidenceInterval,
|
|
9051
9444
|
containsAll,
|
|
9445
|
+
continuousAgreement,
|
|
9052
9446
|
controlFailureClassFromVerification,
|
|
9053
9447
|
controlRunToFeedbackTrajectory,
|
|
9054
9448
|
controlRunToRunRecord,
|
|
@@ -9073,6 +9467,7 @@ export {
|
|
|
9073
9467
|
defaultProviderRedactor,
|
|
9074
9468
|
defaultReferenceReplayMatcher,
|
|
9075
9469
|
deployGateLayer,
|
|
9470
|
+
discoverPersonas,
|
|
9076
9471
|
distillPlaybook,
|
|
9077
9472
|
dominates,
|
|
9078
9473
|
estimateCost,
|
|
@@ -9275,6 +9670,7 @@ export {
|
|
|
9275
9670
|
whitespaceCollapseMutator,
|
|
9276
9671
|
wilcoxonSignedRank,
|
|
9277
9672
|
withAssignedFeedbackSplit,
|
|
9673
|
+
withJudgeRetry,
|
|
9278
9674
|
wranglerDeployRunner
|
|
9279
9675
|
};
|
|
9280
9676
|
//# sourceMappingURL=index.js.map
|