@tangle-network/agent-eval 0.25.0 → 0.27.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +145 -0
- package/README.md +5 -5
- package/dist/builder-eval/index.js +1 -1
- package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
- package/dist/{chunk-K2TPS5LB.js → chunk-4U4BKCXK.js} +2 -2
- package/dist/chunk-4U4BKCXK.js.map +1 -0
- package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
- package/dist/chunk-5AKPEK5L.js.map +1 -0
- package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
- package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
- package/dist/chunk-K33INZHH.js.map +1 -0
- package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
- package/dist/chunk-MAZ26DC7.js.map +1 -0
- package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
- package/dist/chunk-NCRFYPS3.js.map +1 -0
- package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
- package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
- package/dist/chunk-QHF6EQKK.js.map +1 -0
- package/dist/chunk-R5UQJNKC.js +722 -0
- package/dist/chunk-R5UQJNKC.js.map +1 -0
- package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
- package/dist/chunk-RUI6SIHY.js.map +1 -0
- package/dist/{chunk-EDUKQ5AM.js → chunk-SZSBQUIJ.js} +2 -2
- package/dist/chunk-SZSBQUIJ.js.map +1 -0
- package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
- package/dist/chunk-VSMTAMNK.js.map +1 -0
- package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
- package/dist/chunk-XFZCM5Z3.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/{control-CBShYYA6.d.ts → control-BT4qnXiS.d.ts} +2 -2
- package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BZ_lVLYW.d.ts} +1 -0
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -2
- package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-D1aGKusy.d.ts} +1 -1
- package/dist/governance/index.d.ts +1 -1
- package/dist/{index-Oj9fAPPN.d.ts → index-BhLlu-qO.d.ts} +63 -2
- package/dist/index.d.ts +279 -72
- package/dist/index.js +222 -136
- package/dist/index.js.map +1 -1
- package/dist/knowledge/index.d.ts +1 -1
- package/dist/knowledge/index.js +2 -2
- package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-U-c8ge1k.d.ts} +1 -1
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +5 -5
- package/dist/optimization.js +5 -5
- package/dist/pipelines/index.d.ts +1 -1
- package/dist/pipelines/index.js +2 -2
- package/dist/{release-report-BNgMdqPF.d.ts → release-report-CCQqnK46.d.ts} +1 -1
- package/dist/{replay-BL96gCEP.d.ts → replay-D7z0J43-.d.ts} +4 -5
- package/dist/reporting.d.ts +4 -4
- package/dist/reporting.js +5 -5
- package/dist/{researcher-BPT8x_NT.d.ts → researcher-G81CWc0q.d.ts} +9 -10
- package/dist/rl.d.ts +26 -44
- package/dist/rl.js +5 -5
- package/dist/rl.js.map +1 -1
- package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
- package/dist/{summary-report-C7VPYEj2.d.ts → summary-report-Dl4akLKX.d.ts} +13 -1
- package/dist/traces.d.ts +1 -1
- package/dist/traces.js +2 -2
- package/dist/wire/index.d.ts +2 -2
- package/dist/wire/index.js +1 -1
- package/docs/concepts.md +11 -0
- package/docs/research-report-methodology.md +4 -4
- package/docs/three-package-architecture.md +12 -24
- package/package.json +1 -1
- package/dist/chunk-2A5XJB43.js.map +0 -1
- package/dist/chunk-4F5DQN55.js.map +0 -1
- package/dist/chunk-5LBB5B3Z.js.map +0 -1
- package/dist/chunk-EDUKQ5AM.js.map +0 -1
- package/dist/chunk-I4MBDTY5.js +0 -272
- package/dist/chunk-I4MBDTY5.js.map +0 -1
- package/dist/chunk-JLZQWFV3.js.map +0 -1
- package/dist/chunk-K2TPS5LB.js.map +0 -1
- package/dist/chunk-LSH4MMOZ.js.map +0 -1
- package/dist/chunk-NU65VQ7M.js.map +0 -1
- package/dist/chunk-OWLAAMME.js.map +0 -1
- package/dist/chunk-SESZDQPX.js.map +0 -1
- /package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
- /package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
- /package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -11,7 +11,7 @@ import {
|
|
|
11
11
|
failureClusterView,
|
|
12
12
|
iqr,
|
|
13
13
|
welchsTTest
|
|
14
|
-
} from "./chunk-
|
|
14
|
+
} from "./chunk-K33INZHH.js";
|
|
15
15
|
import {
|
|
16
16
|
exportTrainingData,
|
|
17
17
|
toNdjson
|
|
@@ -28,7 +28,7 @@ import {
|
|
|
28
28
|
pytestTestParser,
|
|
29
29
|
runTestGradedScenario,
|
|
30
30
|
vitestTestParser
|
|
31
|
-
} from "./chunk-
|
|
31
|
+
} from "./chunk-QHF6EQKK.js";
|
|
32
32
|
import {
|
|
33
33
|
classifyEuAiRisk,
|
|
34
34
|
euAiActReport,
|
|
@@ -43,7 +43,7 @@ import {
|
|
|
43
43
|
knowledgeReadinessTracePayload,
|
|
44
44
|
scoreKnowledgeReadiness,
|
|
45
45
|
userQuestionsForKnowledgeGaps
|
|
46
|
-
} from "./chunk-
|
|
46
|
+
} from "./chunk-3CKU6VGU.js";
|
|
47
47
|
import {
|
|
48
48
|
controlFailureClassFromVerification,
|
|
49
49
|
controlRunToRunRecord,
|
|
@@ -54,7 +54,7 @@ import {
|
|
|
54
54
|
runProposeReview,
|
|
55
55
|
runProposeReviewAsControlLoop,
|
|
56
56
|
scoreFromEvals
|
|
57
|
-
} from "./chunk-
|
|
57
|
+
} from "./chunk-PALJO75S.js";
|
|
58
58
|
import {
|
|
59
59
|
allCriticalPassed,
|
|
60
60
|
objectiveEval,
|
|
@@ -62,7 +62,7 @@ import {
|
|
|
62
62
|
stopOnNoProgress,
|
|
63
63
|
stopOnRepeatedAction,
|
|
64
64
|
subjectiveEval
|
|
65
|
-
} from "./chunk-
|
|
65
|
+
} from "./chunk-NCRFYPS3.js";
|
|
66
66
|
import {
|
|
67
67
|
CallbackResearcher,
|
|
68
68
|
DEFAULT_MUTATION_PRIMITIVES,
|
|
@@ -96,7 +96,7 @@ import {
|
|
|
96
96
|
summarizePreferenceMemory,
|
|
97
97
|
trialTraceFromMultiShotTrial,
|
|
98
98
|
withAssignedFeedbackSplit
|
|
99
|
-
} from "./chunk-
|
|
99
|
+
} from "./chunk-SZSBQUIJ.js";
|
|
100
100
|
import {
|
|
101
101
|
RunRecordValidationError,
|
|
102
102
|
isRunRecord,
|
|
@@ -111,10 +111,10 @@ import {
|
|
|
111
111
|
judgeReplayGate,
|
|
112
112
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
113
113
|
renderReleaseReport
|
|
114
|
-
} from "./chunk-
|
|
114
|
+
} from "./chunk-DBIGN5MJ.js";
|
|
115
115
|
import {
|
|
116
116
|
runEvalCampaign
|
|
117
|
-
} from "./chunk-
|
|
117
|
+
} from "./chunk-RUI6SIHY.js";
|
|
118
118
|
import {
|
|
119
119
|
LlmCallError,
|
|
120
120
|
LlmClient,
|
|
@@ -128,7 +128,7 @@ import {
|
|
|
128
128
|
import {
|
|
129
129
|
evaluateInterimReleaseConfidence,
|
|
130
130
|
pairedEvalueSequence
|
|
131
|
-
} from "./chunk-
|
|
131
|
+
} from "./chunk-MAZ26DC7.js";
|
|
132
132
|
import {
|
|
133
133
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
134
134
|
benjaminiHochberg,
|
|
@@ -141,18 +141,26 @@ import {
|
|
|
141
141
|
requiredSampleSize,
|
|
142
142
|
researchReport,
|
|
143
143
|
summaryTable
|
|
144
|
-
} from "./chunk-
|
|
144
|
+
} from "./chunk-5AKPEK5L.js";
|
|
145
145
|
import {
|
|
146
|
+
calibrateJudge,
|
|
147
|
+
calibrateJudgeContinuous,
|
|
146
148
|
cohensD,
|
|
147
149
|
confidenceInterval,
|
|
150
|
+
continuousAgreement,
|
|
151
|
+
corpusInterRaterAgreement,
|
|
152
|
+
corpusInterRaterAgreementFromJudgeScores,
|
|
148
153
|
interRaterReliability,
|
|
149
154
|
mannWhitneyU,
|
|
150
155
|
normalizeScores,
|
|
151
156
|
pairedTTest,
|
|
152
157
|
partialCredit,
|
|
158
|
+
positionalBias,
|
|
159
|
+
selfPreference,
|
|
160
|
+
verbosityBias,
|
|
153
161
|
weightedMean,
|
|
154
162
|
wilcoxonSignedRank
|
|
155
|
-
} from "./chunk-
|
|
163
|
+
} from "./chunk-R5UQJNKC.js";
|
|
156
164
|
import {
|
|
157
165
|
DEFAULT_REDACTION_RULES,
|
|
158
166
|
FileSystemTraceStore,
|
|
@@ -166,7 +174,7 @@ import {
|
|
|
166
174
|
iterateRawCalls,
|
|
167
175
|
redactString,
|
|
168
176
|
redactValue
|
|
169
|
-
} from "./chunk-
|
|
177
|
+
} from "./chunk-4U4BKCXK.js";
|
|
170
178
|
import {
|
|
171
179
|
aggregateLlm,
|
|
172
180
|
argHash,
|
|
@@ -208,7 +216,7 @@ import {
|
|
|
208
216
|
hashJson,
|
|
209
217
|
signManifest,
|
|
210
218
|
verifyManifest
|
|
211
|
-
} from "./chunk-
|
|
219
|
+
} from "./chunk-VSMTAMNK.js";
|
|
212
220
|
import {
|
|
213
221
|
AgentEvalError,
|
|
214
222
|
CaptureIntegrityError,
|
|
@@ -425,12 +433,12 @@ function ghCliClient(opts = {}) {
|
|
|
425
433
|
await exec("git", ["branch", "-D", input.branchName], { cwd });
|
|
426
434
|
await run("git", ["checkout", "-b", input.branchName]);
|
|
427
435
|
const { mkdir, writeFile } = await import("fs/promises");
|
|
428
|
-
const { dirname: dirname5, join:
|
|
436
|
+
const { dirname: dirname5, join: join4, resolve } = await import("path");
|
|
429
437
|
for (const change of input.fileChanges) {
|
|
430
438
|
const abs = resolve(cwd, change.path);
|
|
431
439
|
await mkdir(dirname5(abs), { recursive: true });
|
|
432
440
|
await writeFile(abs, change.contents, "utf8");
|
|
433
|
-
await run("git", ["add",
|
|
441
|
+
await run("git", ["add", join4(change.path)]);
|
|
434
442
|
}
|
|
435
443
|
const env = {};
|
|
436
444
|
if (input.authorName) env.GIT_AUTHOR_NAME = input.authorName;
|
|
@@ -3073,36 +3081,36 @@ var FileSystemExperimentStore = class {
|
|
|
3073
3081
|
return idx.listRuns(experimentId);
|
|
3074
3082
|
}
|
|
3075
3083
|
async ensureDir() {
|
|
3076
|
-
const
|
|
3077
|
-
await
|
|
3084
|
+
const fs2 = await import("fs/promises");
|
|
3085
|
+
await fs2.mkdir(this.dir, { recursive: true });
|
|
3078
3086
|
}
|
|
3079
3087
|
async append(name, record) {
|
|
3080
3088
|
await this.ensureDir();
|
|
3081
|
-
const
|
|
3089
|
+
const fs2 = await import("fs/promises");
|
|
3082
3090
|
const path = await import("path");
|
|
3083
3091
|
const active = path.join(this.dir, `${name}.ndjson`);
|
|
3084
3092
|
try {
|
|
3085
|
-
const stat = await
|
|
3093
|
+
const stat = await fs2.stat(active);
|
|
3086
3094
|
if (stat.size >= this.maxBytes) {
|
|
3087
3095
|
const rolled = path.join(this.dir, `${name}.${Date.now()}.ndjson`);
|
|
3088
|
-
await
|
|
3096
|
+
await fs2.rename(active, rolled);
|
|
3089
3097
|
}
|
|
3090
3098
|
} catch {
|
|
3091
3099
|
}
|
|
3092
|
-
await
|
|
3100
|
+
await fs2.appendFile(active, `${JSON.stringify(record)}
|
|
3093
3101
|
`, "utf8");
|
|
3094
3102
|
}
|
|
3095
3103
|
async load() {
|
|
3096
3104
|
if (this.loaded && this.index) return this.index;
|
|
3097
|
-
const
|
|
3105
|
+
const fs2 = await import("fs/promises");
|
|
3098
3106
|
const path = await import("path");
|
|
3099
3107
|
const store = new InMemoryExperimentStore();
|
|
3100
3108
|
try {
|
|
3101
|
-
const entries = await
|
|
3109
|
+
const entries = await fs2.readdir(this.dir);
|
|
3102
3110
|
const sorted = entries.filter((f) => f.endsWith(".ndjson")).sort((a, b) => a.localeCompare(b));
|
|
3103
3111
|
for (const file of sorted) {
|
|
3104
3112
|
const full = path.join(this.dir, file);
|
|
3105
|
-
const content = await
|
|
3113
|
+
const content = await fs2.readFile(full, "utf8");
|
|
3106
3114
|
const base = file.split(".")[0];
|
|
3107
3115
|
for (const line of content.split("\n")) {
|
|
3108
3116
|
if (!line.trim()) continue;
|
|
@@ -4956,114 +4964,6 @@ function seededShuffle(items, seed) {
|
|
|
4956
4964
|
return out;
|
|
4957
4965
|
}
|
|
4958
4966
|
|
|
4959
|
-
// src/judge-calibration.ts
|
|
4960
|
-
function calibrateJudge(golden, candidate) {
|
|
4961
|
-
const map = /* @__PURE__ */ new Map();
|
|
4962
|
-
for (const g of golden) map.set(g.itemId, { h: g.humanScore, j: NaN });
|
|
4963
|
-
for (const c of candidate) {
|
|
4964
|
-
const entry = map.get(c.itemId);
|
|
4965
|
-
if (entry) entry.j = c.score;
|
|
4966
|
-
}
|
|
4967
|
-
const common = [...map.values()].filter((v) => Number.isFinite(v.j));
|
|
4968
|
-
const n = common.length;
|
|
4969
|
-
if (n < 2) {
|
|
4970
|
-
return { n, pearson: NaN, kappa: NaN, mae: NaN, worstItems: [] };
|
|
4971
|
-
}
|
|
4972
|
-
const humans = common.map((c) => c.h);
|
|
4973
|
-
const judges = common.map((c) => c.j);
|
|
4974
|
-
const pearson = pearsonR(humans, judges);
|
|
4975
|
-
const kappa = weightedKappa(humans.map(Math.round), judges.map(Math.round));
|
|
4976
|
-
const absDiffs = common.map((c) => Math.abs(c.j - c.h));
|
|
4977
|
-
const mae = absDiffs.reduce((a, b) => a + b, 0) / n;
|
|
4978
|
-
const worst2 = [...map.entries()].filter(([, v]) => Number.isFinite(v.j)).map(([itemId, v]) => ({ itemId, judge: v.j, human: v.h, delta: Math.abs(v.j - v.h) })).sort((a, b) => b.delta - a.delta).slice(0, 5);
|
|
4979
|
-
return { n, pearson, kappa, mae, worstItems: worst2 };
|
|
4980
|
-
}
|
|
4981
|
-
function positionalBias(scores) {
|
|
4982
|
-
const pairs = /* @__PURE__ */ new Map();
|
|
4983
|
-
for (const s of scores) {
|
|
4984
|
-
const slot = pairs.get(s.itemId) ?? {};
|
|
4985
|
-
if (s.positionOfAInput === "first") slot.first = s.score;
|
|
4986
|
-
else if (s.positionOfAInput === "second") slot.second = s.score;
|
|
4987
|
-
pairs.set(s.itemId, slot);
|
|
4988
|
-
}
|
|
4989
|
-
const deltas = [];
|
|
4990
|
-
for (const { first, second } of pairs.values()) {
|
|
4991
|
-
if (first !== void 0 && second !== void 0) deltas.push(first - second);
|
|
4992
|
-
}
|
|
4993
|
-
if (deltas.length === 0) return { avgDelta: 0, n: 0 };
|
|
4994
|
-
return { avgDelta: deltas.reduce((a, b) => a + b, 0) / deltas.length, n: deltas.length };
|
|
4995
|
-
}
|
|
4996
|
-
function verbosityBias(samples) {
|
|
4997
|
-
const n = samples.length;
|
|
4998
|
-
if (n < 3) return { pearson: NaN, n };
|
|
4999
|
-
return {
|
|
5000
|
-
pearson: pearsonR(
|
|
5001
|
-
samples.map((s) => s.outputLen),
|
|
5002
|
-
samples.map((s) => s.score)
|
|
5003
|
-
),
|
|
5004
|
-
n
|
|
5005
|
-
};
|
|
5006
|
-
}
|
|
5007
|
-
function selfPreference(samples) {
|
|
5008
|
-
const inF = samples.filter((s) => s.inFamily).map((s) => s.score);
|
|
5009
|
-
const outF = samples.filter((s) => !s.inFamily).map((s) => s.score);
|
|
5010
|
-
if (inF.length === 0 || outF.length === 0)
|
|
5011
|
-
return { inFamilyMean: 0, outOfFamilyMean: 0, deltaMean: 0, n: 0 };
|
|
5012
|
-
const inMean = inF.reduce((a, b) => a + b, 0) / inF.length;
|
|
5013
|
-
const outMean = outF.reduce((a, b) => a + b, 0) / outF.length;
|
|
5014
|
-
return {
|
|
5015
|
-
inFamilyMean: inMean,
|
|
5016
|
-
outOfFamilyMean: outMean,
|
|
5017
|
-
deltaMean: inMean - outMean,
|
|
5018
|
-
n: samples.length
|
|
5019
|
-
};
|
|
5020
|
-
}
|
|
5021
|
-
function pearsonR(a, b) {
|
|
5022
|
-
if (a.length !== b.length || a.length < 2) return NaN;
|
|
5023
|
-
const mA = a.reduce((s, v) => s + v, 0) / a.length;
|
|
5024
|
-
const mB = b.reduce((s, v) => s + v, 0) / b.length;
|
|
5025
|
-
let num = 0, dA = 0, dB = 0;
|
|
5026
|
-
for (let i = 0; i < a.length; i++) {
|
|
5027
|
-
const da = a[i] - mA;
|
|
5028
|
-
const db = b[i] - mB;
|
|
5029
|
-
num += da * db;
|
|
5030
|
-
dA += da * da;
|
|
5031
|
-
dB += db * db;
|
|
5032
|
-
}
|
|
5033
|
-
if (dA === 0 || dB === 0) return dA === 0 && dB === 0 ? 1 : 0;
|
|
5034
|
-
return num / Math.sqrt(dA * dB);
|
|
5035
|
-
}
|
|
5036
|
-
function weightedKappa(a, b) {
|
|
5037
|
-
if (a.length !== b.length || a.length === 0) return NaN;
|
|
5038
|
-
const min = Math.min(...a, ...b);
|
|
5039
|
-
const max = Math.max(...a, ...b);
|
|
5040
|
-
const K = max - min + 1;
|
|
5041
|
-
if (K < 2) return 1;
|
|
5042
|
-
const observed = Array.from({ length: K }, () => new Array(K).fill(0));
|
|
5043
|
-
const rowMarg = new Array(K).fill(0);
|
|
5044
|
-
const colMarg = new Array(K).fill(0);
|
|
5045
|
-
for (let i = 0; i < a.length; i++) {
|
|
5046
|
-
const ai = a[i] - min;
|
|
5047
|
-
const bi = b[i] - min;
|
|
5048
|
-
const row = observed[ai];
|
|
5049
|
-
row[bi] = (row[bi] ?? 0) + 1;
|
|
5050
|
-
rowMarg[ai]++;
|
|
5051
|
-
colMarg[bi]++;
|
|
5052
|
-
}
|
|
5053
|
-
let num = 0;
|
|
5054
|
-
let den = 0;
|
|
5055
|
-
for (let i = 0; i < K; i++) {
|
|
5056
|
-
for (let j = 0; j < K; j++) {
|
|
5057
|
-
const w = (i - j) ** 2 / (K - 1) ** 2;
|
|
5058
|
-
const expected = rowMarg[i] * colMarg[j] / a.length;
|
|
5059
|
-
num += w * observed[i][j];
|
|
5060
|
-
den += w * expected;
|
|
5061
|
-
}
|
|
5062
|
-
}
|
|
5063
|
-
if (den === 0) return 1;
|
|
5064
|
-
return 1 - num / den;
|
|
5065
|
-
}
|
|
5066
|
-
|
|
5067
4967
|
// src/observability.ts
|
|
5068
4968
|
async function toLangfuseEnvelope(store, runId) {
|
|
5069
4969
|
const run = await store.getRun(runId);
|
|
@@ -5564,7 +5464,7 @@ async function commitBisect(options) {
|
|
|
5564
5464
|
}
|
|
5565
5465
|
async function promptBisect(options) {
|
|
5566
5466
|
const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
|
|
5567
|
-
const
|
|
5467
|
+
const join4 = (paragraphs) => paragraphs.join("\n\n");
|
|
5568
5468
|
const goodParas = split(options.good);
|
|
5569
5469
|
const badParas = split(options.bad);
|
|
5570
5470
|
if (goodParas.length !== badParas.length) {
|
|
@@ -5584,7 +5484,7 @@ async function promptBisect(options) {
|
|
|
5584
5484
|
const result = await bisect({
|
|
5585
5485
|
good: goodMask,
|
|
5586
5486
|
bad: badMask,
|
|
5587
|
-
runEval: (mask) => options.runEval(
|
|
5487
|
+
runEval: (mask) => options.runEval(join4(paragraphsFor(mask))),
|
|
5588
5488
|
maxIterations: options.maxIterations ?? n + 5,
|
|
5589
5489
|
halfway: (g, b) => {
|
|
5590
5490
|
for (let i = 0; i < g.length; i++) {
|
|
@@ -5615,12 +5515,12 @@ async function promptBisect(options) {
|
|
|
5615
5515
|
}
|
|
5616
5516
|
}
|
|
5617
5517
|
const materializedPath = result.path.map((s) => ({
|
|
5618
|
-
state:
|
|
5518
|
+
state: join4(paragraphsFor(s.state)),
|
|
5619
5519
|
score: s.score,
|
|
5620
5520
|
pass: s.pass
|
|
5621
5521
|
}));
|
|
5622
5522
|
return {
|
|
5623
|
-
culprit:
|
|
5523
|
+
culprit: join4(paragraphsFor(culprit)),
|
|
5624
5524
|
path: materializedPath,
|
|
5625
5525
|
converged: result.converged,
|
|
5626
5526
|
inputInconsistent: result.inputInconsistent,
|
|
@@ -8308,6 +8208,52 @@ function createCompositeMutator(opts) {
|
|
|
8308
8208
|
};
|
|
8309
8209
|
}
|
|
8310
8210
|
|
|
8211
|
+
// src/discover-personas.ts
|
|
8212
|
+
import { promises as fs } from "fs";
|
|
8213
|
+
import { basename, extname, join as join3 } from "path";
|
|
8214
|
+
var DEFAULT_PATTERN = /^\d{2}-.+\.(yaml|yml|json|md)$/;
|
|
8215
|
+
async function discoverPersonas(dir, opts = {}) {
|
|
8216
|
+
const pattern = opts.pattern ?? DEFAULT_PATTERN;
|
|
8217
|
+
const exclude = new Set(opts.exclude ?? []);
|
|
8218
|
+
const include = opts.include;
|
|
8219
|
+
async function walk(d) {
|
|
8220
|
+
let entries;
|
|
8221
|
+
try {
|
|
8222
|
+
const raw = await fs.readdir(d, { withFileTypes: true });
|
|
8223
|
+
entries = raw.map((e) => ({ name: e.name, isDir: e.isDirectory() }));
|
|
8224
|
+
} catch (err) {
|
|
8225
|
+
const code = err.code;
|
|
8226
|
+
if (code === "ENOENT") return [];
|
|
8227
|
+
throw err;
|
|
8228
|
+
}
|
|
8229
|
+
const out = [];
|
|
8230
|
+
for (const entry of entries) {
|
|
8231
|
+
const full = join3(d, entry.name);
|
|
8232
|
+
if (entry.isDir) {
|
|
8233
|
+
if (opts.recursive) out.push(...await walk(full));
|
|
8234
|
+
continue;
|
|
8235
|
+
}
|
|
8236
|
+
if (!pattern.test(entry.name)) continue;
|
|
8237
|
+
if (exclude.has(entry.name) || exclude.has(basename(entry.name, extname(entry.name))))
|
|
8238
|
+
continue;
|
|
8239
|
+
if (include && include.length > 0) {
|
|
8240
|
+
const id = basename(entry.name, extname(entry.name));
|
|
8241
|
+
const matched = include.some((needle) => entry.name.includes(needle) || id.includes(needle));
|
|
8242
|
+
if (!matched) continue;
|
|
8243
|
+
}
|
|
8244
|
+
out.push({
|
|
8245
|
+
path: full,
|
|
8246
|
+
filename: entry.name,
|
|
8247
|
+
id: basename(entry.name, extname(entry.name))
|
|
8248
|
+
});
|
|
8249
|
+
}
|
|
8250
|
+
return out;
|
|
8251
|
+
}
|
|
8252
|
+
const results = await walk(dir);
|
|
8253
|
+
results.sort((a, b) => a.filename.localeCompare(b.filename));
|
|
8254
|
+
return results;
|
|
8255
|
+
}
|
|
8256
|
+
|
|
8311
8257
|
// src/evolution-telemetry.ts
|
|
8312
8258
|
import { appendFileSync as appendFileSync3, existsSync as existsSync5, mkdirSync as mkdirSync3, readFileSync as readFileSync4, writeFileSync } from "fs";
|
|
8313
8259
|
import { dirname as dirname3 } from "path";
|
|
@@ -8697,6 +8643,90 @@ var JsonlTrialCache = class {
|
|
|
8697
8643
|
}
|
|
8698
8644
|
};
|
|
8699
8645
|
|
|
8646
|
+
// src/judge-retry.ts
|
|
8647
|
+
var DEFAULT_MAX_ATTEMPTS = 3;
|
|
8648
|
+
var DEFAULT_TIMEOUT_MS = 9e4;
|
|
8649
|
+
var DEFAULT_BACKOFF = (attempt) => Math.min(500 * 2 ** attempt, 16e3);
|
|
8650
|
+
var ABORT_PATTERNS = [
|
|
8651
|
+
/AbortError/i,
|
|
8652
|
+
/TimeoutError/i,
|
|
8653
|
+
/fetch failed/i,
|
|
8654
|
+
/ECONNRESET/i,
|
|
8655
|
+
/ETIMEDOUT/i,
|
|
8656
|
+
/EAI_AGAIN/i,
|
|
8657
|
+
/this operation was aborted/i,
|
|
8658
|
+
/stream.*ended.*unexpectedly/i,
|
|
8659
|
+
/socket hang up/i
|
|
8660
|
+
];
|
|
8661
|
+
var RETRYABLE_HTTP_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);
|
|
8662
|
+
function defaultIsRetryable(err) {
|
|
8663
|
+
if (err instanceof Error) {
|
|
8664
|
+
if (ABORT_PATTERNS.some((p) => p.test(err.message) || p.test(err.name))) return true;
|
|
8665
|
+
const status = err.status;
|
|
8666
|
+
if (typeof status === "number" && RETRYABLE_HTTP_STATUS.has(status)) return true;
|
|
8667
|
+
}
|
|
8668
|
+
return false;
|
|
8669
|
+
}
|
|
8670
|
+
function sleep(ms) {
|
|
8671
|
+
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
8672
|
+
}
|
|
8673
|
+
async function withJudgeRetry(judgeFn, policy = {}) {
|
|
8674
|
+
const maxAttempts = policy.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
|
|
8675
|
+
const timeoutMs = policy.timeoutMs ?? DEFAULT_TIMEOUT_MS;
|
|
8676
|
+
const backoff = policy.backoffMs ?? DEFAULT_BACKOFF;
|
|
8677
|
+
const isRetryable = policy.isRetryable ?? defaultIsRetryable;
|
|
8678
|
+
const models = policy.models && policy.models.length > 0 ? policy.models : [void 0];
|
|
8679
|
+
let totalAttempts = 0;
|
|
8680
|
+
const attemptErrors = [];
|
|
8681
|
+
let lastError;
|
|
8682
|
+
for (const model of models) {
|
|
8683
|
+
for (let attempt = 0; attempt < maxAttempts; attempt++) {
|
|
8684
|
+
totalAttempts += 1;
|
|
8685
|
+
const controller = new AbortController();
|
|
8686
|
+
const timer = setTimeout(() => controller.abort(new Error("TimeoutError")), timeoutMs);
|
|
8687
|
+
try {
|
|
8688
|
+
const value = await judgeFn(model, controller.signal);
|
|
8689
|
+
clearTimeout(timer);
|
|
8690
|
+
return {
|
|
8691
|
+
value,
|
|
8692
|
+
succeeded: true,
|
|
8693
|
+
attempts: totalAttempts,
|
|
8694
|
+
modelUsed: model,
|
|
8695
|
+
attemptErrors
|
|
8696
|
+
};
|
|
8697
|
+
} catch (err) {
|
|
8698
|
+
clearTimeout(timer);
|
|
8699
|
+
const errObj = err instanceof Error ? err : new Error(String(err));
|
|
8700
|
+
lastError = errObj;
|
|
8701
|
+
attemptErrors.push({
|
|
8702
|
+
attempt: totalAttempts,
|
|
8703
|
+
model: model ?? "(default)",
|
|
8704
|
+
error: errObj.message
|
|
8705
|
+
});
|
|
8706
|
+
if (!isRetryable(errObj)) {
|
|
8707
|
+
return {
|
|
8708
|
+
value: null,
|
|
8709
|
+
succeeded: false,
|
|
8710
|
+
attempts: totalAttempts,
|
|
8711
|
+
error: errObj,
|
|
8712
|
+
attemptErrors
|
|
8713
|
+
};
|
|
8714
|
+
}
|
|
8715
|
+
if (attempt < maxAttempts - 1) {
|
|
8716
|
+
await sleep(backoff(attempt));
|
|
8717
|
+
}
|
|
8718
|
+
}
|
|
8719
|
+
}
|
|
8720
|
+
}
|
|
8721
|
+
return {
|
|
8722
|
+
value: null,
|
|
8723
|
+
succeeded: false,
|
|
8724
|
+
attempts: totalAttempts,
|
|
8725
|
+
error: lastError,
|
|
8726
|
+
attemptErrors
|
|
8727
|
+
};
|
|
8728
|
+
}
|
|
8729
|
+
|
|
8700
8730
|
// src/orthogonality.ts
|
|
8701
8731
|
function passOrthogonality(input) {
|
|
8702
8732
|
const passes = input.passes;
|
|
@@ -8914,6 +8944,55 @@ function createSandboxPool(opts) {
|
|
|
8914
8944
|
utilization
|
|
8915
8945
|
};
|
|
8916
8946
|
}
|
|
8947
|
+
|
|
8948
|
+
// src/trial-aggregator.ts
|
|
8949
|
+
function meanOf(xs) {
|
|
8950
|
+
if (xs.length === 0) return 0;
|
|
8951
|
+
return xs.reduce((a, b) => a + b, 0) / xs.length;
|
|
8952
|
+
}
|
|
8953
|
+
function meanMetrics(rows) {
|
|
8954
|
+
if (rows.length === 0) return {};
|
|
8955
|
+
const keys = /* @__PURE__ */ new Set();
|
|
8956
|
+
for (const row of rows) for (const k of Object.keys(row)) keys.add(k);
|
|
8957
|
+
const out = {};
|
|
8958
|
+
for (const k of keys) {
|
|
8959
|
+
const xs = rows.map((r) => r[k]).filter((x) => typeof x === "number");
|
|
8960
|
+
if (xs.length > 0) out[k] = meanOf(xs);
|
|
8961
|
+
}
|
|
8962
|
+
return out;
|
|
8963
|
+
}
|
|
8964
|
+
function aggregateTrialsByMode(trials, opts) {
|
|
8965
|
+
const gradedTrials = trials.filter((t) => !t.error);
|
|
8966
|
+
const judgeOk = gradedTrials.filter((t) => t.judgeSucceeded !== false);
|
|
8967
|
+
const judgeFailed = gradedTrials.filter((t) => t.judgeSucceeded === false);
|
|
8968
|
+
if (opts.mode === "strict-fail" && judgeFailed.length > 0) {
|
|
8969
|
+
return {
|
|
8970
|
+
meanScore: 0,
|
|
8971
|
+
meanCost: 0,
|
|
8972
|
+
meanDurationMs: 0,
|
|
8973
|
+
okRate: 0,
|
|
8974
|
+
countedTrials: 0,
|
|
8975
|
+
excludedFailedTrials: judgeFailed.length,
|
|
8976
|
+
totalTrials: trials.length,
|
|
8977
|
+
metrics: {},
|
|
8978
|
+
strictFailure: {
|
|
8979
|
+
failedCount: judgeFailed.length,
|
|
8980
|
+
firstError: judgeFailed.find((t) => t.judgeError)?.judgeError
|
|
8981
|
+
}
|
|
8982
|
+
};
|
|
8983
|
+
}
|
|
8984
|
+
const counted = opts.mode === "exclude-failed" ? judgeOk : gradedTrials;
|
|
8985
|
+
return {
|
|
8986
|
+
meanScore: meanOf(counted.map((t) => t.score)),
|
|
8987
|
+
meanCost: meanOf(counted.map((t) => t.cost ?? 0)),
|
|
8988
|
+
meanDurationMs: meanOf(counted.map((t) => t.durationMs ?? 0)),
|
|
8989
|
+
okRate: gradedTrials.length === 0 ? 0 : gradedTrials.filter((t) => t.ok).length / gradedTrials.length,
|
|
8990
|
+
countedTrials: counted.length,
|
|
8991
|
+
excludedFailedTrials: judgeFailed.length,
|
|
8992
|
+
totalTrials: trials.length,
|
|
8993
|
+
metrics: meanMetrics(counted.map((t) => t.metrics ?? {}))
|
|
8994
|
+
};
|
|
8995
|
+
}
|
|
8917
8996
|
export {
|
|
8918
8997
|
AgentDriver,
|
|
8919
8998
|
AgentEvalError,
|
|
@@ -9003,6 +9082,7 @@ export {
|
|
|
9003
9082
|
adversarialJudge,
|
|
9004
9083
|
aggregateLlm,
|
|
9005
9084
|
aggregateRunScore,
|
|
9085
|
+
aggregateTrialsByMode,
|
|
9006
9086
|
allCriticalPassed,
|
|
9007
9087
|
analyzeAntiSlop,
|
|
9008
9088
|
analyzeSeries,
|
|
@@ -9025,6 +9105,7 @@ export {
|
|
|
9025
9105
|
buildTrajectory,
|
|
9026
9106
|
byteLengthRange,
|
|
9027
9107
|
calibrateJudge,
|
|
9108
|
+
calibrateJudgeContinuous,
|
|
9028
9109
|
callLlm,
|
|
9029
9110
|
callLlmJson,
|
|
9030
9111
|
canaryLeakView,
|
|
@@ -9049,9 +9130,12 @@ export {
|
|
|
9049
9130
|
computeToolUseMetrics,
|
|
9050
9131
|
confidenceInterval,
|
|
9051
9132
|
containsAll,
|
|
9133
|
+
continuousAgreement,
|
|
9052
9134
|
controlFailureClassFromVerification,
|
|
9053
9135
|
controlRunToFeedbackTrajectory,
|
|
9054
9136
|
controlRunToRunRecord,
|
|
9137
|
+
corpusInterRaterAgreement,
|
|
9138
|
+
corpusInterRaterAgreementFromJudgeScores,
|
|
9055
9139
|
createAntiSlopJudge,
|
|
9056
9140
|
createCompositeMutator,
|
|
9057
9141
|
createCustomJudge,
|
|
@@ -9073,6 +9157,7 @@ export {
|
|
|
9073
9157
|
defaultProviderRedactor,
|
|
9074
9158
|
defaultReferenceReplayMatcher,
|
|
9075
9159
|
deployGateLayer,
|
|
9160
|
+
discoverPersonas,
|
|
9076
9161
|
distillPlaybook,
|
|
9077
9162
|
dominates,
|
|
9078
9163
|
estimateCost,
|
|
@@ -9275,6 +9360,7 @@ export {
|
|
|
9275
9360
|
whitespaceCollapseMutator,
|
|
9276
9361
|
wilcoxonSignedRank,
|
|
9277
9362
|
withAssignedFeedbackSplit,
|
|
9363
|
+
withJudgeRetry,
|
|
9278
9364
|
wranglerDeployRunner
|
|
9279
9365
|
};
|
|
9280
9366
|
//# sourceMappingURL=index.js.map
|