@tangle-network/agent-eval 0.40.5 → 0.42.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +48 -355
- package/dist/campaign/index.js +106 -6
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
- package/dist/chunk-H4TOS272.js.map +1 -0
- package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
- package/dist/chunk-KQ26DYTQ.js.map +1 -0
- package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
- package/dist/chunk-MNL6LXGQ.js.map +1 -0
- package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
- package/dist/chunk-N4SBKEPJ.js.map +1 -0
- package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
- package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
- package/dist/index.d.ts +227 -687
- package/dist/index.js +753 -1237
- package/dist/index.js.map +1 -1
- package/dist/integrity-CTDhR1Sg.d.ts +81 -0
- package/dist/llm-client-BXVRUZyX.d.ts +234 -0
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +67 -3
- package/dist/pipelines/index.js.map +1 -1
- package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
- package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
- package/dist/reporting.d.ts +2 -3
- package/dist/reporting.js +4 -8
- package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
- package/dist/rl.d.ts +103 -221
- package/dist/rl.js +44 -199
- package/dist/rl.js.map +1 -1
- package/dist/sequential-DdV5ShjT.d.ts +561 -0
- package/dist/traces.d.ts +3 -2
- package/dist/traces.js +5 -5
- package/dist/types-BLbRTxoc.d.ts +367 -0
- package/dist/wire/index.d.ts +1 -1
- package/package.json +1 -6
- package/dist/chunk-5U2DOJU4.js.map +0 -1
- package/dist/chunk-AU2JLNSZ.js.map +0 -1
- package/dist/chunk-DMW5VENN.js +0 -1412
- package/dist/chunk-DMW5VENN.js.map +0 -1
- package/dist/chunk-EGIPWXHL.js.map +0 -1
- package/dist/chunk-MAZ26DC7.js +0 -99
- package/dist/chunk-MAZ26DC7.js.map +0 -1
- package/dist/chunk-NKLGKF2Q.js.map +0 -1
- package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
- package/dist/optimization.d.ts +0 -11
- package/dist/optimization.js +0 -71
- package/dist/optimization.js.map +0 -1
- package/dist/sequential-5iSVfzl2.d.ts +0 -139
- package/dist/summary-report-DuZXOk7K.d.ts +0 -917
- /package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -1,14 +1,17 @@
|
|
|
1
1
|
import {
|
|
2
|
+
DEFAULT_MUTATION_PRIMITIVES,
|
|
2
3
|
DEFAULT_RED_TEAM_CORPUS,
|
|
3
4
|
Dataset,
|
|
4
5
|
HoldoutLockedError,
|
|
6
|
+
buildReflectionPrompt,
|
|
5
7
|
hashScenarios,
|
|
8
|
+
parseReflectionResponse,
|
|
6
9
|
redTeamDataset,
|
|
7
10
|
redTeamReport,
|
|
8
11
|
runCanaries,
|
|
9
12
|
scoreRedTeamOutput,
|
|
10
13
|
toolNamesForRun
|
|
11
|
-
} from "./chunk-
|
|
14
|
+
} from "./chunk-N4SBKEPJ.js";
|
|
12
15
|
import {
|
|
13
16
|
BENCHMARK_SPLIT_SEED,
|
|
14
17
|
benchmarks_exports,
|
|
@@ -19,10 +22,9 @@ import {
|
|
|
19
22
|
classifyFailure,
|
|
20
23
|
compareToBaseline,
|
|
21
24
|
computeToolUseMetrics,
|
|
22
|
-
failureClusterView,
|
|
23
25
|
iqr,
|
|
24
26
|
welchsTTest
|
|
25
|
-
} from "./chunk-
|
|
27
|
+
} from "./chunk-H4TOS272.js";
|
|
26
28
|
import {
|
|
27
29
|
exportTrainingData,
|
|
28
30
|
toNdjson
|
|
@@ -74,63 +76,16 @@ import {
|
|
|
74
76
|
stopOnRepeatedAction,
|
|
75
77
|
subjectiveEval
|
|
76
78
|
} from "./chunk-NCRFYPS3.js";
|
|
77
|
-
import {
|
|
78
|
-
CallbackResearcher,
|
|
79
|
-
DEFAULT_MUTATION_PRIMITIVES,
|
|
80
|
-
FileSystemFeedbackTrajectoryStore,
|
|
81
|
-
HeldOutGate,
|
|
82
|
-
InMemoryFeedbackTrajectoryStore,
|
|
83
|
-
InMemoryTrialCache,
|
|
84
|
-
NoopResearcher,
|
|
85
|
-
assignFeedbackSplit,
|
|
86
|
-
buildReflectionPrompt,
|
|
87
|
-
controlRunToFeedbackTrajectory,
|
|
88
|
-
createFeedbackTrajectory,
|
|
89
|
-
crowdingDistance,
|
|
90
|
-
defaultMultiShotObjectives,
|
|
91
|
-
dominates,
|
|
92
|
-
feedbackTrajectoriesToDatasetScenarios,
|
|
93
|
-
feedbackTrajectoriesToOptimizerRows,
|
|
94
|
-
feedbackTrajectoryToDatasetScenario,
|
|
95
|
-
feedbackTrajectoryToOptimizerRow,
|
|
96
|
-
paretoFrontier,
|
|
97
|
-
paretoFrontierWithCrowding,
|
|
98
|
-
parseFeedbackTrajectoriesJsonl,
|
|
99
|
-
parseReflectionResponse,
|
|
100
|
-
renderPreferenceMemoryMarkdown,
|
|
101
|
-
replayFeedbackTrajectories,
|
|
102
|
-
replayFeedbackTrajectory,
|
|
103
|
-
runMultiShotOptimization,
|
|
104
|
-
runPromptEvolution,
|
|
105
|
-
scalarScore,
|
|
106
|
-
serializeFeedbackTrajectoriesJsonl,
|
|
107
|
-
summarizePreferenceMemory,
|
|
108
|
-
trialTraceFromMultiShotTrial,
|
|
109
|
-
withAssignedFeedbackSplit
|
|
110
|
-
} from "./chunk-DMW5VENN.js";
|
|
111
79
|
import {
|
|
112
80
|
assertReleaseConfidence,
|
|
113
81
|
bootstrapCi,
|
|
114
82
|
evaluateReleaseConfidence,
|
|
115
83
|
judgeReplayGate,
|
|
116
|
-
releaseTraceEvidenceFromMultiShotTrials,
|
|
117
84
|
renderReleaseReport
|
|
118
|
-
} from "./chunk-
|
|
85
|
+
} from "./chunk-KQ26DYTQ.js";
|
|
119
86
|
import {
|
|
120
87
|
runEvalCampaign
|
|
121
|
-
} from "./chunk-
|
|
122
|
-
import {
|
|
123
|
-
LlmCallError,
|
|
124
|
-
LlmClient,
|
|
125
|
-
LlmRouteAssertionError,
|
|
126
|
-
assertLlmRoute,
|
|
127
|
-
backoffMs,
|
|
128
|
-
callLlm,
|
|
129
|
-
callLlmJson,
|
|
130
|
-
isTransientLlmError,
|
|
131
|
-
probeLlm,
|
|
132
|
-
stripFencedJson
|
|
133
|
-
} from "./chunk-VXNVVBZO.js";
|
|
88
|
+
} from "./chunk-PD3MH6WU.js";
|
|
134
89
|
import {
|
|
135
90
|
AGENT_PROFILE_KINDS,
|
|
136
91
|
AgentProfileCellValidationError,
|
|
@@ -150,17 +105,15 @@ import {
|
|
|
150
105
|
validateRunRecord,
|
|
151
106
|
verifyAgentProfileCell
|
|
152
107
|
} from "./chunk-BWZEGTES.js";
|
|
153
|
-
import {
|
|
154
|
-
evaluateInterimReleaseConfidence,
|
|
155
|
-
pairedEvalueSequence
|
|
156
|
-
} from "./chunk-MAZ26DC7.js";
|
|
157
108
|
import {
|
|
158
109
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
110
|
+
evaluateInterimReleaseConfidence,
|
|
159
111
|
gainHistogram,
|
|
112
|
+
pairedEvalueSequence,
|
|
160
113
|
paretoChart,
|
|
161
114
|
researchReport,
|
|
162
115
|
summaryTable
|
|
163
|
-
} from "./chunk-
|
|
116
|
+
} from "./chunk-MNL6LXGQ.js";
|
|
164
117
|
import {
|
|
165
118
|
benjaminiHochberg,
|
|
166
119
|
bonferroni,
|
|
@@ -250,13 +203,6 @@ import {
|
|
|
250
203
|
assertRunCaptured,
|
|
251
204
|
throwIfRunIncomplete
|
|
252
205
|
} from "./chunk-UBPIXOC4.js";
|
|
253
|
-
import {
|
|
254
|
-
FileSystemRawProviderSink,
|
|
255
|
-
InMemoryRawProviderSink,
|
|
256
|
-
NoopRawProviderSink,
|
|
257
|
-
defaultProviderRedactor,
|
|
258
|
-
providerFromBaseUrl
|
|
259
|
-
} from "./chunk-PC4UYEBM.js";
|
|
260
206
|
import {
|
|
261
207
|
TraceEmitter,
|
|
262
208
|
llmSpanFromProvider
|
|
@@ -268,6 +214,25 @@ import {
|
|
|
268
214
|
signManifest,
|
|
269
215
|
verifyManifest
|
|
270
216
|
} from "./chunk-VSMTAMNK.js";
|
|
217
|
+
import {
|
|
218
|
+
LlmCallError,
|
|
219
|
+
LlmClient,
|
|
220
|
+
LlmRouteAssertionError,
|
|
221
|
+
assertLlmRoute,
|
|
222
|
+
backoffMs,
|
|
223
|
+
callLlm,
|
|
224
|
+
callLlmJson,
|
|
225
|
+
isTransientLlmError,
|
|
226
|
+
probeLlm,
|
|
227
|
+
stripFencedJson
|
|
228
|
+
} from "./chunk-VXNVVBZO.js";
|
|
229
|
+
import {
|
|
230
|
+
FileSystemRawProviderSink,
|
|
231
|
+
InMemoryRawProviderSink,
|
|
232
|
+
NoopRawProviderSink,
|
|
233
|
+
defaultProviderRedactor,
|
|
234
|
+
providerFromBaseUrl
|
|
235
|
+
} from "./chunk-PC4UYEBM.js";
|
|
271
236
|
import {
|
|
272
237
|
AgentEvalError,
|
|
273
238
|
CaptureIntegrityError,
|
|
@@ -853,8 +818,8 @@ function createJudgeAdapter(opts) {
|
|
|
853
818
|
cost: opts.cost ?? { kind: "llm" },
|
|
854
819
|
version: `judge-${ADAPTER_REV}`,
|
|
855
820
|
async analyze(input) {
|
|
856
|
-
const
|
|
857
|
-
return
|
|
821
|
+
const scores2 = await opts.judge(opts.tcloud, input);
|
|
822
|
+
return scores2.filter((s) => normalize10(s.score) < threshold).map((s) => liftJudgeScore(id, area, s));
|
|
858
823
|
}
|
|
859
824
|
};
|
|
860
825
|
}
|
|
@@ -2203,10 +2168,10 @@ function ghCliClient(opts = {}) {
|
|
|
2203
2168
|
await exec("git", ["branch", "-D", input.branchName], { cwd });
|
|
2204
2169
|
await run("git", ["checkout", "-b", input.branchName]);
|
|
2205
2170
|
const { mkdir, writeFile } = await import("fs/promises");
|
|
2206
|
-
const { dirname:
|
|
2171
|
+
const { dirname: dirname4, join: join4, resolve } = await import("path");
|
|
2207
2172
|
for (const change of input.fileChanges) {
|
|
2208
2173
|
const abs = resolve(cwd, change.path);
|
|
2209
|
-
await mkdir(
|
|
2174
|
+
await mkdir(dirname4(abs), { recursive: true });
|
|
2210
2175
|
await writeFile(abs, change.contents, "utf8");
|
|
2211
2176
|
await run("git", ["add", join4(change.path)]);
|
|
2212
2177
|
}
|
|
@@ -2404,8 +2369,8 @@ async function executeScenario(tc, scenario, config) {
|
|
|
2404
2369
|
console.log(` judge retry ${attempt}/2 (waiting ${wait / 1e3}s)`);
|
|
2405
2370
|
await new Promise((r) => setTimeout(r, wait));
|
|
2406
2371
|
}
|
|
2407
|
-
const
|
|
2408
|
-
judgeResults.push(
|
|
2372
|
+
const scores2 = await judge(tc, judgeInput);
|
|
2373
|
+
judgeResults.push(scores2);
|
|
2409
2374
|
await new Promise((r) => setTimeout(r, 3e3));
|
|
2410
2375
|
break;
|
|
2411
2376
|
} catch (err) {
|
|
@@ -3118,200 +3083,400 @@ ${lastResponse}` : "No conversation yet. Send your opening message \u2014 in cha
|
|
|
3118
3083
|
return content.trim();
|
|
3119
3084
|
}
|
|
3120
3085
|
|
|
3121
|
-
// src/
|
|
3122
|
-
|
|
3086
|
+
// src/feedback-trajectory.ts
|
|
3087
|
+
var DEFAULT_SPLIT_POLICY = {
|
|
3088
|
+
trainPct: 70,
|
|
3089
|
+
devPct: 15,
|
|
3090
|
+
testPct: 10,
|
|
3091
|
+
holdoutPct: 5
|
|
3092
|
+
};
|
|
3093
|
+
var InMemoryFeedbackTrajectoryStore = class {
|
|
3094
|
+
trajectories = /* @__PURE__ */ new Map();
|
|
3095
|
+
async save(trajectory) {
|
|
3096
|
+
this.trajectories.set(trajectory.id, cloneTrajectory(trajectory));
|
|
3097
|
+
}
|
|
3098
|
+
async get(id) {
|
|
3099
|
+
const trajectory = this.trajectories.get(id);
|
|
3100
|
+
return trajectory ? cloneTrajectory(trajectory) : null;
|
|
3101
|
+
}
|
|
3102
|
+
async list(filter = {}) {
|
|
3103
|
+
return [...this.trajectories.values()].filter((trajectory) => matchesFilter(trajectory, filter)).map(cloneTrajectory);
|
|
3104
|
+
}
|
|
3105
|
+
async appendAttempt(id, attempt) {
|
|
3106
|
+
const trajectory = this.trajectories.get(id);
|
|
3107
|
+
if (!trajectory)
|
|
3108
|
+
throw new Error(`FeedbackTrajectoryStore.appendAttempt: unknown trajectory "${id}"`);
|
|
3109
|
+
const next = cloneTrajectory({
|
|
3110
|
+
...trajectory,
|
|
3111
|
+
attempts: [...trajectory.attempts, attempt],
|
|
3112
|
+
updatedAt: attempt.createdAt
|
|
3113
|
+
});
|
|
3114
|
+
this.trajectories.set(id, next);
|
|
3115
|
+
return cloneTrajectory(next);
|
|
3116
|
+
}
|
|
3117
|
+
async appendLabel(id, label, attemptId) {
|
|
3118
|
+
const trajectory = this.trajectories.get(id);
|
|
3119
|
+
if (!trajectory)
|
|
3120
|
+
throw new Error(`FeedbackTrajectoryStore.appendLabel: unknown trajectory "${id}"`);
|
|
3121
|
+
const attempts = attemptId ? trajectory.attempts.map(
|
|
3122
|
+
(attempt) => attempt.id === attemptId ? { ...attempt, feedback: [...attempt.feedback ?? [], label] } : attempt
|
|
3123
|
+
) : trajectory.attempts;
|
|
3124
|
+
const next = cloneTrajectory({
|
|
3125
|
+
...trajectory,
|
|
3126
|
+
attempts,
|
|
3127
|
+
labels: attemptId ? trajectory.labels : [...trajectory.labels, label],
|
|
3128
|
+
updatedAt: label.createdAt
|
|
3129
|
+
});
|
|
3130
|
+
this.trajectories.set(id, next);
|
|
3131
|
+
return cloneTrajectory(next);
|
|
3132
|
+
}
|
|
3133
|
+
};
|
|
3134
|
+
var FileSystemFeedbackTrajectoryStore = class {
|
|
3135
|
+
dir;
|
|
3136
|
+
memory = new InMemoryFeedbackTrajectoryStore();
|
|
3137
|
+
loaded = false;
|
|
3138
|
+
constructor(options) {
|
|
3139
|
+
this.dir = options.dir;
|
|
3140
|
+
}
|
|
3141
|
+
async save(trajectory) {
|
|
3142
|
+
await this.load();
|
|
3143
|
+
await this.memory.save(trajectory);
|
|
3144
|
+
await this.append({ op: "save", trajectory });
|
|
3145
|
+
}
|
|
3146
|
+
async get(id) {
|
|
3147
|
+
await this.load();
|
|
3148
|
+
return this.memory.get(id);
|
|
3149
|
+
}
|
|
3150
|
+
async list(filter = {}) {
|
|
3151
|
+
await this.load();
|
|
3152
|
+
return this.memory.list(filter);
|
|
3153
|
+
}
|
|
3154
|
+
async appendAttempt(id, attempt) {
|
|
3155
|
+
await this.load();
|
|
3156
|
+
const next = await this.memory.appendAttempt(id, attempt);
|
|
3157
|
+
await this.append({ op: "appendAttempt", id, attempt });
|
|
3158
|
+
return next;
|
|
3159
|
+
}
|
|
3160
|
+
async appendLabel(id, label, attemptId) {
|
|
3161
|
+
await this.load();
|
|
3162
|
+
const next = await this.memory.appendLabel(id, label, attemptId);
|
|
3163
|
+
await this.append({ op: "appendLabel", id, label, attemptId });
|
|
3164
|
+
return next;
|
|
3165
|
+
}
|
|
3166
|
+
async append(record) {
|
|
3167
|
+
const { appendFile, mkdir } = await import("fs/promises");
|
|
3168
|
+
const { join: join4 } = await import("path");
|
|
3169
|
+
await mkdir(this.dir, { recursive: true });
|
|
3170
|
+
await appendFile(
|
|
3171
|
+
join4(this.dir, "feedback-trajectories.ndjson"),
|
|
3172
|
+
`${JSON.stringify(record)}
|
|
3173
|
+
`,
|
|
3174
|
+
"utf8"
|
|
3175
|
+
);
|
|
3176
|
+
}
|
|
3177
|
+
async load() {
|
|
3178
|
+
if (this.loaded) return;
|
|
3179
|
+
const { readFile } = await import("fs/promises");
|
|
3180
|
+
const { join: join4 } = await import("path");
|
|
3181
|
+
const file = join4(this.dir, "feedback-trajectories.ndjson");
|
|
3182
|
+
try {
|
|
3183
|
+
const raw = await readFile(file, "utf8");
|
|
3184
|
+
for (const line of raw.split("\n")) {
|
|
3185
|
+
if (!line.trim()) continue;
|
|
3186
|
+
try {
|
|
3187
|
+
const record = JSON.parse(line);
|
|
3188
|
+
if (record.op === "save") await this.memory.save(record.trajectory);
|
|
3189
|
+
if (record.op === "appendAttempt")
|
|
3190
|
+
await this.memory.appendAttempt(record.id, record.attempt);
|
|
3191
|
+
if (record.op === "appendLabel")
|
|
3192
|
+
await this.memory.appendLabel(record.id, record.label, record.attemptId);
|
|
3193
|
+
} catch {
|
|
3194
|
+
}
|
|
3195
|
+
}
|
|
3196
|
+
} catch {
|
|
3197
|
+
}
|
|
3198
|
+
this.loaded = true;
|
|
3199
|
+
}
|
|
3200
|
+
};
|
|
3201
|
+
function createFeedbackTrajectory(input) {
|
|
3202
|
+
const createdAt = input.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
|
|
3203
|
+
const id = input.id ?? `ft_${stableHash(`${input.projectId ?? ""}|${input.scenarioId ?? ""}|${input.task.intent}|${createdAt}`).toString(16)}`;
|
|
3123
3204
|
return {
|
|
3124
|
-
|
|
3125
|
-
|
|
3126
|
-
|
|
3127
|
-
|
|
3128
|
-
|
|
3129
|
-
|
|
3205
|
+
id,
|
|
3206
|
+
projectId: input.projectId,
|
|
3207
|
+
scenarioId: input.scenarioId,
|
|
3208
|
+
task: input.task,
|
|
3209
|
+
attempts: input.attempts ?? [],
|
|
3210
|
+
labels: input.labels ?? [],
|
|
3211
|
+
outcome: input.outcome,
|
|
3212
|
+
split: input.split,
|
|
3213
|
+
tags: input.tags,
|
|
3214
|
+
createdAt,
|
|
3215
|
+
metadata: input.metadata
|
|
3130
3216
|
};
|
|
3131
3217
|
}
|
|
3132
|
-
function
|
|
3133
|
-
const
|
|
3134
|
-
const
|
|
3135
|
-
|
|
3136
|
-
const
|
|
3218
|
+
function assignFeedbackSplit(trajectory, policy = {}) {
|
|
3219
|
+
const split = { ...DEFAULT_SPLIT_POLICY, ...policy };
|
|
3220
|
+
const total = split.trainPct + split.devPct + split.testPct + split.holdoutPct;
|
|
3221
|
+
if (total <= 0) throw new Error("assignFeedbackSplit: split percentages must sum above zero");
|
|
3222
|
+
const bucket = stableHash(
|
|
3223
|
+
`${trajectory.projectId ?? ""}|${trajectory.scenarioId ?? ""}|${trajectory.id}|${trajectory.task.intent}`
|
|
3224
|
+
) % total;
|
|
3225
|
+
if (bucket < split.trainPct) return "train";
|
|
3226
|
+
if (bucket < split.trainPct + split.devPct) return "dev";
|
|
3227
|
+
if (bucket < split.trainPct + split.devPct + split.testPct) return "test";
|
|
3228
|
+
return "holdout";
|
|
3229
|
+
}
|
|
3230
|
+
function withAssignedFeedbackSplit(trajectory, policy) {
|
|
3137
3231
|
return {
|
|
3138
|
-
|
|
3139
|
-
|
|
3140
|
-
...input.actionId ? { actionId: input.actionId } : {},
|
|
3141
|
-
status,
|
|
3142
|
-
missingConnections,
|
|
3143
|
-
missingScopes,
|
|
3144
|
-
requiredScopes,
|
|
3145
|
-
missing: resolutionMissingItems(input, missingConnections, missingScopes, requiredScopes),
|
|
3146
|
-
optionalMissing: [],
|
|
3147
|
-
ready: status === "ready" ? [
|
|
3148
|
-
{
|
|
3149
|
-
status: "ready",
|
|
3150
|
-
connectorId: input.connectorId,
|
|
3151
|
-
...input.actionId ? { actionId: input.actionId } : {},
|
|
3152
|
-
requiredScopes
|
|
3153
|
-
}
|
|
3154
|
-
] : [],
|
|
3155
|
-
approvalRequired: input.approvalRequired ?? false,
|
|
3156
|
-
...input.reason ? { reason: input.reason } : {},
|
|
3157
|
-
...input.metadata ? { metadata: input.metadata } : {}
|
|
3232
|
+
...trajectory,
|
|
3233
|
+
split: trajectory.split ?? assignFeedbackSplit(trajectory, policy)
|
|
3158
3234
|
};
|
|
3159
3235
|
}
|
|
3160
|
-
function
|
|
3236
|
+
function feedbackTrajectoryToDatasetScenario(trajectory) {
|
|
3237
|
+
const withSplit = withAssignedFeedbackSplit(trajectory);
|
|
3161
3238
|
return {
|
|
3162
|
-
|
|
3163
|
-
|
|
3164
|
-
|
|
3165
|
-
|
|
3166
|
-
|
|
3167
|
-
|
|
3168
|
-
|
|
3169
|
-
|
|
3239
|
+
id: withSplit.scenarioId ?? withSplit.id,
|
|
3240
|
+
split: withSplit.split,
|
|
3241
|
+
payload: withSplit,
|
|
3242
|
+
tags: {
|
|
3243
|
+
...withSplit.projectId ? { projectId: withSplit.projectId } : {},
|
|
3244
|
+
...withSplit.tags ?? {},
|
|
3245
|
+
source: "feedback-trajectory"
|
|
3246
|
+
}
|
|
3170
3247
|
};
|
|
3171
3248
|
}
|
|
3172
|
-
function
|
|
3173
|
-
|
|
3174
|
-
|
|
3175
|
-
|
|
3176
|
-
|
|
3177
|
-
|
|
3178
|
-
|
|
3179
|
-
|
|
3180
|
-
|
|
3181
|
-
|
|
3182
|
-
|
|
3183
|
-
|
|
3184
|
-
|
|
3185
|
-
|
|
3186
|
-
|
|
3187
|
-
|
|
3188
|
-
|
|
3189
|
-
|
|
3190
|
-
|
|
3191
|
-
|
|
3192
|
-
|
|
3193
|
-
|
|
3194
|
-
})
|
|
3195
|
-
);
|
|
3196
|
-
const missingScopes = input.missingScopes ?? [];
|
|
3197
|
-
evals.push(
|
|
3198
|
-
objectiveEval({
|
|
3199
|
-
id: `integration-scopes-ready:${input.connectorId}`,
|
|
3200
|
-
passed: missingScopes.length === 0,
|
|
3201
|
-
score: missingScopes.length === 0 ? 1 : 0,
|
|
3202
|
-
severity: missingScopes.length === 0 ? "info" : "critical",
|
|
3203
|
-
detail: missingScopes.length === 0 ? "Required integration scopes are granted." : `Missing integration scope(s): ${missingScopes.join(", ")}`,
|
|
3204
|
-
evidence: missingScopes.join(", ") || void 0,
|
|
3205
|
-
metadata: {
|
|
3206
|
-
connectorId: input.connectorId,
|
|
3207
|
-
missingScopes,
|
|
3208
|
-
requiredScopes: input.requiredScopes ?? []
|
|
3209
|
-
}
|
|
3210
|
-
})
|
|
3211
|
-
);
|
|
3212
|
-
if (input.approvalRequired) {
|
|
3213
|
-
evals.push(
|
|
3214
|
-
objectiveEval({
|
|
3215
|
-
id: `integration-approval-required:${input.connectorId}`,
|
|
3216
|
-
passed: false,
|
|
3217
|
-
score: 0,
|
|
3218
|
-
severity: "warning",
|
|
3219
|
-
detail: "Integration action requires approval before execution.",
|
|
3220
|
-
metadata: { connectorId: input.connectorId, actionId: input.actionId }
|
|
3221
|
-
})
|
|
3222
|
-
);
|
|
3223
|
-
}
|
|
3224
|
-
return evals;
|
|
3249
|
+
function feedbackTrajectoriesToDatasetScenarios(trajectories) {
|
|
3250
|
+
return trajectories.map(feedbackTrajectoryToDatasetScenario);
|
|
3251
|
+
}
|
|
3252
|
+
function feedbackTrajectoryToOptimizerRow(trajectory) {
|
|
3253
|
+
const labels = allLabels(trajectory);
|
|
3254
|
+
return {
|
|
3255
|
+
scenarioId: trajectory.scenarioId ?? trajectory.id,
|
|
3256
|
+
trajectoryId: trajectory.id,
|
|
3257
|
+
labelKinds: [...new Set(labels.map((label) => label.kind))],
|
|
3258
|
+
score: trajectory.outcome?.score ?? scoreFromLabels(labels),
|
|
3259
|
+
metadata: {
|
|
3260
|
+
projectId: trajectory.projectId,
|
|
3261
|
+
split: trajectory.split,
|
|
3262
|
+
intent: trajectory.task.intent,
|
|
3263
|
+
attempts: trajectory.attempts.length,
|
|
3264
|
+
outcome: trajectory.outcome,
|
|
3265
|
+
labels
|
|
3266
|
+
}
|
|
3267
|
+
};
|
|
3268
|
+
}
|
|
3269
|
+
function feedbackTrajectoriesToOptimizerRows(trajectories) {
|
|
3270
|
+
return trajectories.map(feedbackTrajectoryToOptimizerRow);
|
|
3225
3271
|
}
|
|
3226
|
-
function
|
|
3227
|
-
|
|
3272
|
+
async function replayFeedbackTrajectory(trajectory, adapter) {
|
|
3273
|
+
try {
|
|
3274
|
+
const result = await adapter.replay(trajectory);
|
|
3228
3275
|
return {
|
|
3229
|
-
|
|
3230
|
-
|
|
3231
|
-
|
|
3232
|
-
|
|
3233
|
-
|
|
3234
|
-
|
|
3276
|
+
trajectoryId: trajectory.id,
|
|
3277
|
+
...result
|
|
3278
|
+
};
|
|
3279
|
+
} catch (err) {
|
|
3280
|
+
const createdAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
3281
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
3282
|
+
return {
|
|
3283
|
+
trajectoryId: trajectory.id,
|
|
3284
|
+
pass: false,
|
|
3285
|
+
labels: [
|
|
3286
|
+
{
|
|
3287
|
+
source: "system",
|
|
3288
|
+
kind: "reject",
|
|
3289
|
+
value: false,
|
|
3290
|
+
reason: message,
|
|
3291
|
+
severity: "error",
|
|
3292
|
+
createdAt
|
|
3293
|
+
}
|
|
3294
|
+
],
|
|
3295
|
+
outcome: {
|
|
3296
|
+
success: false,
|
|
3297
|
+
score: 0,
|
|
3298
|
+
detail: message,
|
|
3299
|
+
observedAt: createdAt
|
|
3300
|
+
},
|
|
3301
|
+
metadata: { replayError: true }
|
|
3235
3302
|
};
|
|
3236
3303
|
}
|
|
3237
|
-
const missingConnections = input.missingConnections ?? [];
|
|
3238
|
-
const missingScopes = input.missingScopes ?? [];
|
|
3239
|
-
const surface = !input.valid ? "integration-manifest" : missingConnections.length > 0 ? "integration-connection" : missingScopes.length > 0 ? "integration-scope" : input.approvalRequired ? "integration-approval" : "integration-policy";
|
|
3240
|
-
return {
|
|
3241
|
-
expectationId: `integration-ready:${input.connectorId}${input.actionId ? `:${input.actionId}` : ""}`,
|
|
3242
|
-
message: input.reason ?? messageForManifest(input),
|
|
3243
|
-
severity: input.valid && missingConnections.length === 0 && missingScopes.length === 0 && !input.approvalRequired ? "info" : "error",
|
|
3244
|
-
responsibleSurface: surface,
|
|
3245
|
-
suggestion: suggestionForManifest(input),
|
|
3246
|
-
metadata: { integration: input }
|
|
3247
|
-
};
|
|
3248
3304
|
}
|
|
3249
|
-
function
|
|
3250
|
-
|
|
3251
|
-
|
|
3252
|
-
|
|
3253
|
-
|
|
3254
|
-
|
|
3255
|
-
|
|
3256
|
-
|
|
3257
|
-
|
|
3258
|
-
|
|
3259
|
-
|
|
3260
|
-
|
|
3261
|
-
|
|
3262
|
-
|
|
3263
|
-
|
|
3264
|
-
|
|
3265
|
-
|
|
3266
|
-
|
|
3267
|
-
|
|
3268
|
-
|
|
3269
|
-
|
|
3270
|
-
|
|
3305
|
+
async function replayFeedbackTrajectories(trajectories, adapter) {
|
|
3306
|
+
const results = [];
|
|
3307
|
+
for (const trajectory of trajectories) {
|
|
3308
|
+
results.push(await replayFeedbackTrajectory(trajectory, adapter));
|
|
3309
|
+
}
|
|
3310
|
+
return results;
|
|
3311
|
+
}
|
|
3312
|
+
function summarizePreferenceMemory(trajectories, options = {}) {
|
|
3313
|
+
const maxEntries = options.maxEntries ?? 20;
|
|
3314
|
+
const entries = [];
|
|
3315
|
+
for (const trajectory of trajectories) {
|
|
3316
|
+
for (const label of allLabels(trajectory)) {
|
|
3317
|
+
const instruction = instructionFromLabel(trajectory, label);
|
|
3318
|
+
if (!instruction) continue;
|
|
3319
|
+
entries.push({
|
|
3320
|
+
instruction,
|
|
3321
|
+
rationale: label.reason ?? `${label.kind} label from ${label.source}`,
|
|
3322
|
+
weight: weightForLabel(label),
|
|
3323
|
+
sourceTrajectoryId: trajectory.id,
|
|
3324
|
+
sourceLabelId: label.id,
|
|
3325
|
+
category: label.kind
|
|
3326
|
+
});
|
|
3327
|
+
}
|
|
3328
|
+
}
|
|
3329
|
+
const byInstruction = /* @__PURE__ */ new Map();
|
|
3330
|
+
for (const entry of entries) {
|
|
3331
|
+
const key = entry.instruction.toLowerCase().replace(/\s+/g, " ").trim();
|
|
3332
|
+
const existing = byInstruction.get(key);
|
|
3333
|
+
if (!existing || entry.weight > existing.weight) byInstruction.set(key, entry);
|
|
3334
|
+
}
|
|
3335
|
+
return [...byInstruction.values()].sort((a, b) => b.weight - a.weight).slice(0, maxEntries);
|
|
3336
|
+
}
|
|
3337
|
+
function renderPreferenceMemoryMarkdown(entries) {
|
|
3338
|
+
const lines = ["# Preference Memory", ""];
|
|
3339
|
+
for (const entry of entries) {
|
|
3340
|
+
lines.push(`- ${entry.instruction}`);
|
|
3341
|
+
lines.push(` Rationale: ${entry.rationale}`);
|
|
3342
|
+
lines.push(` Source: ${entry.sourceTrajectoryId}`);
|
|
3343
|
+
lines.push("");
|
|
3344
|
+
}
|
|
3345
|
+
return `${lines.join("\n").trim()}
|
|
3346
|
+
`;
|
|
3347
|
+
}
|
|
3348
|
+
function serializeFeedbackTrajectoriesJsonl(trajectories) {
|
|
3349
|
+
return `${trajectories.slice().sort((a, b) => a.id.localeCompare(b.id)).map((trajectory) => JSON.stringify(canonicalize2(trajectory))).join("\n")}
|
|
3350
|
+
`;
|
|
3351
|
+
}
|
|
3352
|
+
function parseFeedbackTrajectoriesJsonl(jsonl) {
|
|
3353
|
+
const trajectories = [];
|
|
3354
|
+
for (const line of jsonl.split("\n")) {
|
|
3355
|
+
if (!line.trim()) continue;
|
|
3356
|
+
trajectories.push(JSON.parse(line));
|
|
3357
|
+
}
|
|
3358
|
+
return trajectories;
|
|
3359
|
+
}
|
|
3360
|
+
function controlRunToFeedbackTrajectory(run, options = {}) {
|
|
3361
|
+
const createdAt = options.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
|
|
3362
|
+
const trajectoryId = run.runId ?? `ft_control_${stableHash(`${run.intent}|${createdAt}`).toString(16)}`;
|
|
3363
|
+
return createFeedbackTrajectory({
|
|
3364
|
+
id: trajectoryId,
|
|
3365
|
+
projectId: options.projectId,
|
|
3366
|
+
scenarioId: options.scenarioId,
|
|
3367
|
+
task: { intent: run.intent },
|
|
3368
|
+
createdAt,
|
|
3369
|
+
attempts: run.steps.map((step) => ({
|
|
3370
|
+
id: `${trajectoryId}_step_${step.index}`,
|
|
3371
|
+
stepIndex: step.index,
|
|
3372
|
+
artifactType: options.artifactType ?? "action",
|
|
3373
|
+
artifact: options.artifactFromStep?.(step) ?? step.actionOutcome?.result ?? step.decision,
|
|
3374
|
+
proposedAction: options.proposedActionFromStep?.(step),
|
|
3375
|
+
evals: step.evalsAfter,
|
|
3376
|
+
createdAt: step.startedAt,
|
|
3377
|
+
metadata: {
|
|
3378
|
+
decision: step.decision,
|
|
3379
|
+
actionOutcome: step.actionOutcome
|
|
3380
|
+
}
|
|
3381
|
+
})),
|
|
3382
|
+
labels: [
|
|
3383
|
+
{
|
|
3384
|
+
source: "system",
|
|
3385
|
+
kind: run.pass ? "approve" : "reject",
|
|
3386
|
+
value: run.pass,
|
|
3387
|
+
reason: run.reason,
|
|
3388
|
+
severity: run.pass ? "info" : "error",
|
|
3389
|
+
createdAt
|
|
3390
|
+
}
|
|
3391
|
+
],
|
|
3392
|
+
outcome: {
|
|
3393
|
+
success: run.pass,
|
|
3394
|
+
score: run.score,
|
|
3395
|
+
costUsd: run.spentCostUsd,
|
|
3396
|
+
detail: run.reason,
|
|
3397
|
+
observedAt: createdAt,
|
|
3398
|
+
metadata: {
|
|
3399
|
+
stoppedBy: run.stoppedBy,
|
|
3400
|
+
failureClass: run.failureClass
|
|
3401
|
+
}
|
|
3271
3402
|
}
|
|
3403
|
+
});
|
|
3404
|
+
}
|
|
3405
|
+
function allLabels(trajectory) {
|
|
3406
|
+
const labels = [
|
|
3407
|
+
...trajectory.labels,
|
|
3408
|
+
...trajectory.attempts.flatMap((attempt) => attempt.feedback ?? [])
|
|
3272
3409
|
];
|
|
3410
|
+
const seen = /* @__PURE__ */ new Set();
|
|
3411
|
+
return labels.filter((label) => {
|
|
3412
|
+
const key = label.id ?? `${label.source}|${label.kind}|${label.createdAt}|${JSON.stringify(label.value)}`;
|
|
3413
|
+
if (seen.has(key)) return false;
|
|
3414
|
+
seen.add(key);
|
|
3415
|
+
return true;
|
|
3416
|
+
});
|
|
3417
|
+
}
|
|
3418
|
+
function scoreFromLabels(labels) {
|
|
3419
|
+
if (!labels.length) return void 0;
|
|
3420
|
+
const scored = labels.map((label) => {
|
|
3421
|
+
if (label.kind === "approve" || label.kind === "select") return 1;
|
|
3422
|
+
if (label.kind === "reject" || label.kind === "policy_block") return 0;
|
|
3423
|
+
if (label.kind === "rate" && typeof label.value === "number")
|
|
3424
|
+
return Math.max(0, Math.min(1, label.value));
|
|
3425
|
+
return void 0;
|
|
3426
|
+
}).filter((value) => typeof value === "number");
|
|
3427
|
+
if (!scored.length) return void 0;
|
|
3428
|
+
return Math.round(scored.reduce((sum3, value) => sum3 + value, 0) / scored.length * 1e3) / 1e3;
|
|
3429
|
+
}
|
|
3430
|
+
function instructionFromLabel(trajectory, label) {
|
|
3431
|
+
if (label.kind === "reject" && label.reason)
|
|
3432
|
+
return `Avoid outputs like "${compact(trajectory.task.intent, 80)}" when: ${label.reason}`;
|
|
3433
|
+
if (label.kind === "revision_request" && label.reason)
|
|
3434
|
+
return `Revise similar work by applying: ${label.reason}`;
|
|
3435
|
+
if (label.kind === "select" && label.reason)
|
|
3436
|
+
return `Prefer selected options for "${compact(trajectory.task.intent, 80)}" because: ${label.reason}`;
|
|
3437
|
+
if (label.kind === "approve" && label.reason)
|
|
3438
|
+
return `Repeat the pattern approved for "${compact(trajectory.task.intent, 80)}": ${label.reason}`;
|
|
3439
|
+
if (label.kind === "comment" && label.reason) return label.reason;
|
|
3440
|
+
return void 0;
|
|
3441
|
+
}
|
|
3442
|
+
function weightForLabel(label) {
|
|
3443
|
+
const severity = label.severity === "critical" ? 4 : label.severity === "error" ? 3 : label.severity === "warning" ? 2 : 1;
|
|
3444
|
+
const source = label.source === "user" ? 3 : label.source === "metric" || label.source === "environment" ? 2 : 1;
|
|
3445
|
+
return severity * source;
|
|
3446
|
+
}
|
|
3447
|
+
function matchesFilter(trajectory, filter) {
|
|
3448
|
+
if (filter.projectId && trajectory.projectId !== filter.projectId) return false;
|
|
3449
|
+
if (filter.scenarioId && trajectory.scenarioId !== filter.scenarioId) return false;
|
|
3450
|
+
if (filter.split && trajectory.split !== filter.split) return false;
|
|
3451
|
+
if (filter.tag) {
|
|
3452
|
+
const [key, value] = filter.tag;
|
|
3453
|
+
if (trajectory.tags?.[key] !== value) return false;
|
|
3454
|
+
}
|
|
3455
|
+
return true;
|
|
3456
|
+
}
|
|
3457
|
+
function cloneTrajectory(trajectory) {
|
|
3458
|
+
return JSON.parse(JSON.stringify(trajectory));
|
|
3273
3459
|
}
|
|
3274
|
-
function
|
|
3275
|
-
|
|
3276
|
-
|
|
3277
|
-
|
|
3278
|
-
|
|
3279
|
-
|
|
3280
|
-
|
|
3281
|
-
|
|
3282
|
-
|
|
3283
|
-
|
|
3284
|
-
|
|
3285
|
-
|
|
3286
|
-
|
|
3287
|
-
if (
|
|
3288
|
-
|
|
3289
|
-
|
|
3290
|
-
|
|
3291
|
-
|
|
3292
|
-
|
|
3293
|
-
|
|
3294
|
-
return `Fix the integration manifest for ${input.connectorId}.${input.actionId}.`;
|
|
3295
|
-
return `Retry or degrade gracefully after ${input.connectorId} provider failure.`;
|
|
3296
|
-
}
|
|
3297
|
-
function messageForManifest(input) {
|
|
3298
|
-
if (!input.valid) return `Integration manifest for ${input.connectorId} is invalid.`;
|
|
3299
|
-
if ((input.missingConnections?.length ?? 0) > 0)
|
|
3300
|
-
return `Missing connection for ${input.connectorId}.`;
|
|
3301
|
-
if ((input.missingScopes?.length ?? 0) > 0)
|
|
3302
|
-
return `Missing required scopes for ${input.connectorId}.`;
|
|
3303
|
-
if (input.approvalRequired)
|
|
3304
|
-
return `Approval required for ${input.connectorId}${input.actionId ? `.${input.actionId}` : ""}.`;
|
|
3305
|
-
return `${input.connectorId} is ready.`;
|
|
3306
|
-
}
|
|
3307
|
-
function suggestionForManifest(input) {
|
|
3308
|
-
if (!input.valid) return "Fix or regenerate the integration manifest before running the agent.";
|
|
3309
|
-
if ((input.missingConnections?.length ?? 0) > 0)
|
|
3310
|
-
return `Connect ${input.missingConnections.join(", ")} before replaying the workflow.`;
|
|
3311
|
-
if ((input.missingScopes?.length ?? 0) > 0)
|
|
3312
|
-
return `Request scopes: ${input.missingScopes.join(", ")}.`;
|
|
3313
|
-
if (input.approvalRequired) return "Create an approval request and replay after approval.";
|
|
3314
|
-
return "No action required.";
|
|
3460
|
+
/**
 * Collapse all whitespace runs to single spaces and cap the text length.
 *
 * @param {string} value - Text to normalise.
 * @param {number} max - Maximum number of characters kept before the ellipsis.
 * @returns {string} The normalised text, truncated with a trailing "..." when it exceeds `max`.
 */
function compact(value, max) {
  const collapsed = value.replace(/\s+/g, " ").trim();
  const exceedsLimit = collapsed.length > max;
  if (!exceedsLimit) return collapsed;
  // Trim again so a cut that lands on a space does not leave "word ...".
  const head = collapsed.slice(0, max).trim();
  return `${head}...`;
}
|
|
3464
|
+
/**
 * Compute a deterministic unsigned 32-bit hash of a string.
 *
 * Each UTF-16 code unit is XOR-ed into the state, which is then multiplied by
 * 16777619 with 32-bit wraparound, starting from 2166136261 (the FNV-1a
 * 32-bit constants).
 *
 * @param {string} input - Text to hash.
 * @returns {number} Hash in the range [0, 2^32).
 */
function stableHash(input) {
  const OFFSET_BASIS = 2166136261;
  const PRIME = 16777619;
  let state = OFFSET_BASIS;
  let index = 0;
  while (index < input.length) {
    // Math.imul keeps the multiplication in 32-bit integer space.
    state = Math.imul(state ^ input.charCodeAt(index), PRIME);
    index += 1;
  }
  // Reinterpret the signed 32-bit state as unsigned.
  return state >>> 0;
}
|
|
3472
|
+
/**
 * Recursively rebuild a value so that every plain object has its keys in
 * sorted order, giving a stable shape for serialization.
 *
 * Primitives and `null` are returned unchanged; arrays keep their element
 * order while each element is canonicalized.
 *
 * @param {*} value - Any JSON-like value.
 * @returns {*} The canonicalized copy (or the original primitive).
 */
function canonicalize2(value) {
  const isContainer = value !== null && typeof value === "object";
  if (!isContainer) return value;
  if (Array.isArray(value)) return value.map(canonicalize2);
  const sortedKeys = Object.keys(value).sort();
  return sortedKeys.reduce((ordered, key) => {
    ordered[key] = canonicalize2(value[key]);
    return ordered;
  }, {});
}
|
|
3316
3481
|
|
|
3317
3482
|
// src/integrity/backend-integrity.ts
|
|
@@ -3796,9 +3961,9 @@ function scorePrReviewComments(auditCase, comments, source, weights = {}) {
|
|
|
3796
3961
|
})
|
|
3797
3962
|
};
|
|
3798
3963
|
}
|
|
3799
|
-
function summarizePrReviewBenchmark(
|
|
3964
|
+
function summarizePrReviewBenchmark(scores2) {
|
|
3800
3965
|
const bySource = /* @__PURE__ */ new Map();
|
|
3801
|
-
for (const score of
|
|
3966
|
+
for (const score of scores2) {
|
|
3802
3967
|
bySource.set(score.source, [...bySource.get(score.source) ?? [], score]);
|
|
3803
3968
|
}
|
|
3804
3969
|
return [...bySource.entries()].map(([source, sourceScores]) => ({
|
|
@@ -3913,396 +4078,6 @@ function sum(values) {
|
|
|
3913
4078
|
return values.reduce((total, value) => total + value, 0);
|
|
3914
4079
|
}
|
|
3915
4080
|
|
|
3916
|
-
// src/production-loop.ts
|
|
3917
|
-
/**
 * Run one production-loop cycle: observe runs and feedback, pick an actionable
 * failure cluster, evolve the baseline prompt against it, gate the result on
 * held-out scenarios, and optionally open a pull request with the new prompt.
 *
 * The returned record's `decision` is one of `no_actionable_failures`,
 * `evolve_yielded_no_improvement`, `gate_failed`, `proposed_change` or
 * `pr_opened`.
 *
 * @param {object} opts - Loop options (`runId`, `target`, `traceStore`,
 *   `feedbackStore`, `cluster`, `evolve`, optional `ship`, `now`,
 *   `releaseThresholds`, `cron`).
 * @returns {Promise<object>} The cycle record assembled by `finalize`.
 * @throws {ValidationError} When options are invalid or search and holdout scenario ids overlap.
 */
async function runProductionLoop(opts) {
  validate2(opts);
  const now = opts.now ?? (() => /* @__PURE__ */ new Date());
  const startedAt = now().toISOString();
  const observedRuns = await opts.traceStore.listRuns();
  const observedFeedback = await opts.feedbackStore.list();
  const clusterReport = await failureClusterView(opts.traceStore, {
    minClusterSize: opts.cluster.minClusterSize ?? 1
  });
  // Every exit path reports the same observations; only the decision and outcome fields differ.
  const finish = (decision, outcome) => finalize({
    opts,
    decision,
    startedAt,
    now,
    observedRunCount: observedRuns.length,
    observedFeedbackCount: observedFeedback.length,
    clusters: clusterReport.clusters,
    ...outcome
  });
  const sizeFloor = opts.cluster.minClusterSize ?? 5;
  const severityFloor = opts.cluster.minSeverityRatio ?? 0.05;
  const clusterBudget = opts.cluster.maxClustersPerCycle ?? 1;
  const { totalRuns } = clusterReport;
  const actionable = clusterReport.clusters
    .filter((c) => c.runCount >= sizeFloor && (totalRuns === 0 || c.runCount / totalRuns >= severityFloor))
    .slice(0, clusterBudget);
  if (actionable.length === 0) {
    return finish("no_actionable_failures", {
      actedOnCluster: null,
      evolution: null,
      release: null,
      gate: null,
      promotedPrompt: opts.evolve.baselinePrompt,
      pullRequest: null
    });
  }
  const [actedOn] = actionable;
  const baselineId = opts.evolve.baselineId ?? "baseline";
  const baseline = {
    id: baselineId,
    label: baselineId,
    generation: 0,
    payload: opts.evolve.baselinePrompt
  };
  const idOf = (scenario) => scenario.id;
  const holdoutIds = uniqueIds(opts.evolve.holdoutScenarios.map(idOf));
  const searchSource = opts.evolve.searchScenarios ?? deriveSearchScenarios(opts.evolve.holdoutScenarios);
  const searchIds = uniqueIds(searchSource.map(idOf));
  if (searchIds.some((id) => holdoutIds.includes(id))) {
    throw new ValidationError(
      "runProductionLoop: searchScenarios and holdoutScenarios must be disjoint"
    );
  }
  const reps = opts.evolve.reps ?? 3;
  const generations = opts.evolve.generations ?? 3;
  const populationSize = opts.evolve.populationSize ?? Math.max(2, opts.evolve.reps ?? 4);
  // Fallback used when the caller does not supply its own run-record mapper.
  const fallbackToRunRecord = ({ variant, scenarioId, rep, split, seed, trial }) => syntheticRunRecord({
    runId: `${opts.runId}-${variant.id}-${scenarioId}-${rep}-${split}`,
    variant,
    scenarioId,
    rep,
    split,
    seed,
    trial,
    target: opts.target
  });
  const evolution = await runMultiShotOptimization({
    runId: `${opts.runId}/evolve`,
    target: opts.target,
    seedVariants: [baseline],
    searchScenarioIds: searchIds,
    reps,
    generations,
    populationSize,
    scoreConcurrency: opts.evolve.scoreConcurrency ?? 1,
    runner: opts.evolve.runner,
    scorer: opts.evolve.scorer,
    mutateAdapter: opts.evolve.mutator,
    gate: {
      holdoutScenarioIds: holdoutIds,
      reps,
      gate: { ...opts.evolve.gate, baselineKey: baseline.id },
      toRunRecord: opts.evolve.toRunRecord ?? fallbackToRunRecord
    }
  });
  const gate = evolution.gate?.decision ?? null;
  const promotedVariant = evolution.promotedVariant;
  const promoted = promotedVariant.payload;
  const promotedChanged = promotedVariant.id !== baseline.id;
  const allTrials = evolution.evolution.generations.flatMap((g) => g.trials);
  const traceEvidence = releaseTraceEvidenceFromMultiShotTrials(allTrials);
  const asReleaseScenario = (split) => (s) => ({
    id: s.id,
    payload: s,
    split,
    tags: { persona: s.persona, label: s.label }
  });
  // Only explicitly supplied search scenarios are listed as "train"; derived ones are not.
  const releaseScenarios = [
    ...(opts.evolve.searchScenarios ?? []).map(asReleaseScenario("train")),
    ...opts.evolve.holdoutScenarios.map(asReleaseScenario("holdout"))
  ];
  const release = evaluateReleaseConfidence({
    target: opts.target,
    candidateId: promotedVariant.id,
    baselineId: baseline.id,
    scenarios: releaseScenarios,
    traces: traceEvidence,
    gateDecision: gate ?? void 0,
    thresholds: opts.releaseThresholds,
    runs: [...evolution.gate?.candidateRuns ?? [], ...evolution.gate?.baselineRuns ?? []]
  });
  const outcome = {
    actedOnCluster: actedOn,
    evolution,
    release,
    gate,
    promotedPrompt: promoted,
    pullRequest: null
  };
  if (!promotedChanged) {
    return finish("evolve_yielded_no_improvement", outcome);
  }
  if (release.status === "fail" || gate && !gate.promote) {
    return finish("gate_failed", outcome);
  }
  if (!opts.ship) {
    return finish("proposed_change", outcome);
  }
  const baselineStr = toPromptString(baseline.payload);
  const promotedStr = toPromptString(promoted);
  const ctx = {
    runId: opts.runId,
    target: opts.target,
    decision: "pr_opened",
    clusters: clusterReport.clusters,
    actedOnCluster: actedOn,
    observedRunCount: observedRuns.length,
    observedFeedbackCount: observedFeedback.length,
    evolution,
    release,
    gate,
    baselinePromptString: baselineStr,
    promotedPromptString: promotedStr
  };
  const renderBody = opts.ship.renderBody ?? defaultRenderBody;
  const renderFile = opts.ship.renderPromptFile ?? ((next, _prev) => `${next}\n`);
  const currentFile = opts.ship.readCurrentPromptFile ? await opts.ship.readCurrentPromptFile() : null;
  const pr = await proposeAutomatedPullRequest(opts.ship.client, {
    repo: opts.ship.repo,
    baseBranch: opts.ship.baseBranch ?? "main",
    branchName: `${opts.ship.branchPrefix.replace(/\/+$/, "")}/${opts.runId}`,
    title: `${opts.target}: production-loop prompt update (${opts.runId})`,
    body: renderBody(ctx),
    reviewers: opts.ship.reviewers,
    labels: opts.ship.labels,
    fileChanges: [
      {
        path: opts.ship.promptFilePath,
        contents: renderFile(promotedStr, currentFile),
        rationale: `Auto-improved against cluster "${actedOn.failureClass}" (${actedOn.runCount} prod failures)`
      }
    ],
    dryRun: opts.ship.dryRun
  });
  return finish("pr_opened", { ...outcome, pullRequest: pr });
}
|
|
4133
|
-
/**
 * Assemble the production-loop result record from the values gathered during a cycle.
 *
 * @param {object} args - Cycle state: `opts`, `decision`, `startedAt`, `now`,
 *   observation counts, `clusters`, `actedOnCluster`, `evolution`, `release`,
 *   `gate`, `promotedPrompt` and `pullRequest`.
 * @returns {object} The result record; `finishedAt` is taken from `args.now()`
 *   and `cron` falls back to `null` when not configured.
 */
function finalize(args) {
  const { opts } = args;
  const {
    decision,
    startedAt,
    observedRunCount,
    observedFeedbackCount,
    clusters,
    actedOnCluster,
    evolution,
    release,
    gate,
    promotedPrompt,
    pullRequest
  } = args;
  return {
    runId: opts.runId,
    target: opts.target,
    decision,
    startedAt,
    finishedAt: args.now().toISOString(),
    observedRunCount,
    observedFeedbackCount,
    clusters,
    actedOnCluster,
    evolution,
    release,
    gate,
    baselinePrompt: opts.evolve.baselinePrompt,
    promotedPrompt,
    pullRequest,
    cron: opts.cron ?? null
  };
}
|
|
4153
|
-
/**
 * Validate the options passed to `runProductionLoop` before any work starts.
 *
 * Checks that `runId` and `target` are non-blank strings, that at least one
 * holdout scenario exists, that `searchScenarios` is either omitted or
 * non-empty, and — when `ship` is configured — that the branch prefix and
 * prompt file path are non-blank.
 *
 * @param {object} opts - Production-loop options.
 * @returns {void}
 * @throws {ValidationError} On the first violated requirement.
 */
function validate2(opts) {
  // Non-strings are treated as missing so callers get a ValidationError, not a TypeError from `.trim()`.
  const isBlank = (value) => typeof value !== "string" || value.trim() === "";
  if (isBlank(opts.runId)) throw new ValidationError("runProductionLoop: runId required");
  if (isBlank(opts.target)) throw new ValidationError("runProductionLoop: target required");
  if (opts.evolve.holdoutScenarios.length === 0) {
    throw new ValidationError("runProductionLoop: evolve.holdoutScenarios must not be empty");
  }
  if (opts.evolve.searchScenarios && opts.evolve.searchScenarios.length === 0) {
    throw new ValidationError(
      "runProductionLoop: evolve.searchScenarios must be omitted or non-empty"
    );
  }
  // The former `gate.baselineKey` / `baselineId` check had an empty body; it validated
  // nothing and only crashed when `evolve.gate` was omitted, so it has been removed.
  if (opts.ship) {
    if (isBlank(opts.ship.branchPrefix)) {
      throw new ValidationError("runProductionLoop: ship.branchPrefix required");
    }
    if (isBlank(opts.ship.promptFilePath)) {
      throw new ValidationError("runProductionLoop: ship.promptFilePath required");
    }
  }
}
|
|
4175
|
-
/**
 * Remove duplicate ids while keeping the first occurrence of each in its original position.
 *
 * @param {Iterable<*>} ids - Ids to deduplicate.
 * @returns {Array<*>} New array of distinct ids in first-seen order.
 */
function uniqueIds(ids) {
  // A Set iterates in insertion order, so first-seen ordering is preserved.
  return [...new Set(ids)];
}
|
|
4185
|
-
/**
 * Derive search scenarios from the holdout set when none were supplied.
 *
 * Every fourth holdout scenario (indices 0, 4, 8, ...) is copied with
 * `__search` appended to its id. Sets with fewer than four entries therefore
 * contribute only their first scenario, and an empty set yields an empty
 * result instead of dereferencing a missing first element.
 *
 * @param {Array<object>} holdout - Holdout scenarios, each with an `id`.
 * @returns {Array<object>} Shallow copies with `__search`-suffixed ids; inputs are not mutated.
 */
function deriveSearchScenarios(holdout) {
  return holdout
    .filter((_, index) => index % 4 === 0)
    .map((scenario) => ({ ...scenario, id: `${scenario.id}__search` }));
}
|
|
4196
|
-
/**
 * Build a placeholder run record for a production-loop trial.
 *
 * Hash and commit fields are zero-filled and the model is the fixed marker
 * "production-loop@synthetic". The trial score is stored under
 * `holdoutScore` for the holdout split and `searchScore` otherwise.
 *
 * @param {object} input - `runId`, `target`, `variant`, `scenarioId`, `split`, `seed`, `trial`.
 * @returns {object} The run record.
 */
function syntheticRunRecord(input) {
  const { trial, split } = input;
  const zeroHex = (length) => "0".repeat(length);
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";
  const outcome = {
    [scoreField]: trial.score,
    raw: { score: trial.score, ok: trial.ok ? 1 : 0 }
  };
  return {
    runId: input.runId,
    experimentId: input.target,
    candidateId: input.variant.id,
    seed: input.seed,
    model: "production-loop@synthetic",
    promptHash: zeroHex(64),
    configHash: zeroHex(64),
    commitSha: zeroHex(40),
    // Missing duration/cost default to 1 ms and 0 USD respectively.
    wallMs: trial.durationMs ?? 1,
    costUsd: trial.cost ?? 0,
    tokenUsage: { input: 0, output: 0 },
    outcome,
    splitTag: split,
    scenarioId: input.scenarioId
  };
}
|
|
4218
|
-
/**
 * Render a prompt payload as text.
 *
 * Strings pass through unchanged, `null`/`undefined` become the empty string,
 * and everything else is pretty-printed as JSON. When JSON cannot represent
 * the payload — `JSON.stringify` throws (cycles, BigInt) or yields
 * `undefined` (functions, symbols) — the result falls back to `String(payload)`
 * so the return value is always a string.
 *
 * @param {*} payload - Prompt payload of any type.
 * @returns {string} Textual form of the payload.
 */
function toPromptString(payload) {
  if (typeof payload === "string") return payload;
  if (payload == null) return "";
  try {
    // JSON.stringify returns undefined (not a string) for functions and symbols.
    return JSON.stringify(payload, null, 2) ?? String(payload);
  } catch {
    // Best-effort fallback for values JSON refuses to serialize.
    return String(payload);
  }
}
|
|
4227
|
-
/**
 * Render the default Markdown pull-request body for a production-loop run.
 *
 * Sections appear in a fixed order: header and observation counts, the
 * triggering failure cluster (if any), the held-out promotion gate (if any),
 * release confidence (if any), and finally a fenced diff of the baseline and
 * promoted prompts.
 *
 * @param {object} ctx - Render context (`target`, `runId`, `decision`, counts,
 *   `actedOnCluster`, `gate`, `release`, `baselinePromptString`, `promotedPromptString`).
 * @returns {string} The Markdown body, lines joined with "\n".
 */
function defaultRenderBody(ctx) {
  const { actedOnCluster: cluster, release, gate } = ctx;
  const lines = [
    `## Production-loop prompt update \u2014 \`${ctx.target}\``,
    "",
    `Run id: \`${ctx.runId}\``,
    `Decision: \`${ctx.decision}\``,
    `Observed in this cycle: ${ctx.observedRunCount} prod runs, ${ctx.observedFeedbackCount} feedback trajectories.`,
    ""
  ];
  if (cluster) {
    lines.push(
      "### Triggering failure cluster",
      "",
      `- **class**: \`${cluster.failureClass}\``,
      `- **runs in cluster**: ${cluster.runCount}`,
      `- **distinct scenarios**: ${cluster.scenarioIds.length}`
    );
    if (cluster.toolName) lines.push(`- **tool**: \`${cluster.toolName}\``);
    if (cluster.dimension) lines.push(`- **judge dimension**: \`${cluster.dimension}\``);
    if (cluster.exampleError) {
      // Keep the example on one line and bounded to 200 characters.
      const sample = cluster.exampleError.slice(0, 200).replace(/\n/g, " ");
      lines.push(`- **example error**: \`${sample}\``);
    }
    lines.push("");
  }
  if (gate) {
    lines.push(
      "### Held-out promotion gate",
      "",
      `- **decision**: \`${gate.promote ? "PROMOTE" : "REJECT"}\``,
      `- **paired median delta**: ${gate.evidence.medianPairedDelta.toFixed(4)}`,
      `- **paired 95% CI**: [${gate.evidence.pairedCI.low.toFixed(4)}, ${gate.evidence.pairedCI.high.toFixed(4)}]`,
      `- **paired p-value**: ${gate.evidence.pairedPValue.toFixed(4)}`,
      `- **search/holdout means**: ${gate.evidence.searchScore.toFixed(4)} / ${gate.evidence.holdoutScore.toFixed(4)}`,
      `- **overfit gap**: ${gate.evidence.overfitGap.toFixed(4)}`,
      ""
    );
  }
  if (release) {
    lines.push(
      "### Release confidence",
      "",
      `- **status**: \`${release.status}\``,
      `- **pass rate**: ${release.metrics.passRate.toFixed(4)}`,
      `- **mean score**: ${release.metrics.meanScore.toFixed(4)}`
    );
    if (release.issues.length > 0) {
      lines.push("- **issues**:");
      for (const issue of release.issues) {
        lines.push(`  - \`${issue.severity}\` ${issue.axis}: ${issue.detail}`);
      }
    }
    lines.push("");
  }
  lines.push(
    "### Prompt diff",
    "",
    "```diff",
    unifiedDiff(ctx.baselinePromptString, ctx.promotedPromptString),
    "```"
  );
  return lines.join("\n");
}
|
|
4291
|
-
/**
 * Produce a simple positional line diff of two texts.
 *
 * Lines are compared index by index (no alignment or context): where they
 * differ, the old line is emitted with a "- " prefix and the new line with a
 * "+ " prefix. Identical lines are omitted.
 *
 * @param {string} a - Original text.
 * @param {string} b - Updated text.
 * @returns {string} The changed lines joined with "\n" (empty when nothing differs).
 */
function unifiedDiff(a, b) {
  const before = a.split("\n");
  const after = b.split("\n");
  const rowCount = Math.max(before.length, after.length);
  const changes = [];
  for (let row = 0; row < rowCount; row += 1) {
    const removed = before[row];
    const added = after[row];
    if (removed !== added) {
      // Past the end of either side the value is undefined and contributes nothing.
      if (removed !== undefined) changes.push(`- ${removed}`);
      if (added !== undefined) changes.push(`+ ${added}`);
    }
  }
  return changes.join("\n");
}
|
|
4305
|
-
|
|
4306
4081
|
// src/registry.ts
|
|
4307
4082
|
var ScenarioRegistry = class {
|
|
4308
4083
|
scenarios = [];
|
|
@@ -5395,6 +5170,89 @@ var FileSystemExperimentStore = class {
|
|
|
5395
5170
|
}
|
|
5396
5171
|
};
|
|
5397
5172
|
|
|
5173
|
+
// src/pareto.ts
|
|
5174
|
+
/**
 * Pareto-dominance test: `a` dominates `b` when it is no worse on every
 * objective and strictly better on at least one.
 *
 * Any non-finite objective value on either side makes the comparison fail.
 *
 * @param {*} a - Candidate being tested as the dominator.
 * @param {*} b - Candidate being tested as the dominated one.
 * @param {Array<{direction: string, value: Function}>} objectives - Objectives;
 *   `direction === "maximize"` prefers larger values, anything else prefers smaller.
 * @returns {boolean} Whether `a` dominates `b`.
 */
function dominates(a, b, objectives) {
  let hasStrictWin = false;
  for (const objective of objectives) {
    const left = objective.value(a);
    const right = objective.value(b);
    if (!(Number.isFinite(left) && Number.isFinite(right))) return false;
    // Ties neither help nor hurt.
    if (left === right) continue;
    const leftWins = objective.direction === "maximize" ? left > right : left < right;
    if (!leftWins) return false;
    hasStrictWin = true;
  }
  return hasStrictWin;
}
|
|
5187
|
+
/**
 * Split candidates into the Pareto frontier and the dominated remainder.
 *
 * Candidates with any non-finite objective value are excluded from both sets.
 *
 * @param {Array<*>} candidates - Candidates to partition.
 * @param {Array<{direction: string, value: Function}>} objectives - At least one objective.
 * @returns {{frontier: Array, dominated: Array, dominanceMap: Array<{dominator: *, dominated: Array}>}}
 *   The frontier, the dominated candidates, and for each frontier member the
 *   dominated candidates it beats.
 * @throws {Error} When `objectives` is empty.
 */
function paretoFrontier(candidates, objectives) {
  if (objectives.length === 0) {
    throw new Error("paretoFrontier: at least 1 objective required");
  }
  const hasFiniteScores = (candidate) => objectives.every((o) => Number.isFinite(o.value(candidate)));
  const valid = candidates.filter(hasFiniteScores);
  const frontier = [];
  const dominated = [];
  for (const candidate of valid) {
    const beaten = valid.some((rival) => rival !== candidate && dominates(rival, candidate, objectives));
    (beaten ? dominated : frontier).push(candidate);
  }
  const dominanceMap = frontier.map((dominator) => ({
    dominator,
    dominated: dominated.filter((loser) => dominates(dominator, loser, objectives))
  }));
  return { frontier, dominated, dominanceMap };
}
|
|
5205
|
+
/**
 * Collapse multiple objectives into one weighted score per candidate.
 *
 * Each objective is min-max normalised over the finite values seen across
 * `candidates`, flipped for non-"maximize" objectives, and combined using
 * weights normalised to sum to one. Non-finite values contribute nothing.
 *
 * @param {Array<*>} candidates - Candidates to score.
 * @param {Array<{name: string, direction: string, value: Function}>} objectives - Objectives to combine.
 * @param {{weights?: Object<string, number>}} [options] - Per-objective weights by name (default 1 each).
 * @returns {Array<{candidate: *, score: number}>} Scores in the same order as `candidates`.
 */
function scalarScore(candidates, objectives, options = {}) {
  if (candidates.length === 0) return [];
  const weights = options.weights ?? {};
  const weightOf = (objective) => weights[objective.name] ?? 1;
  let totalWeight = 0;
  for (const objective of objectives) {
    totalWeight += weightOf(objective);
  }
  const ranges = objectives.map((objective) => {
    const finite = candidates.map((c) => objective.value(c)).filter((v) => Number.isFinite(v));
    if (finite.length === 0) return { min: 0, max: 1 };
    const min = Math.min(...finite);
    const max = Math.max(...finite);
    // A degenerate range is widened to 1 so the normalisation never divides by zero.
    return { min, max: max === min ? min + 1 : max };
  });
  return candidates.map((candidate) => {
    let score = 0;
    for (const [position, objective] of objectives.entries()) {
      const raw = objective.value(candidate);
      if (!Number.isFinite(raw)) continue;
      const { min, max } = ranges[position];
      const normalised = (raw - min) / (max - min);
      const directional = objective.direction === "maximize" ? normalised : 1 - normalised;
      score += directional * (weightOf(objective) / totalWeight);
    }
    return { candidate, score };
  });
}
|
|
5230
|
+
/**
 * Compute a crowding distance for each candidate across the given objectives.
 *
 * For every objective the candidates are ordered by value; the two extremes
 * receive `Infinity`, and each interior candidate accumulates the gap between
 * its neighbours divided by the objective's value range (or 1 when the range
 * is zero).
 *
 * An empty `candidates` array yields an empty result rather than evaluating
 * objectives on a missing element.
 *
 * @param {Array<*>} candidates - Candidates to measure (typically a Pareto frontier).
 * @param {Array<{value: Function}>} objectives - Objectives providing a numeric `value(candidate)`.
 * @returns {Array<{candidate: *, distance: number}>} Distances in the same order as `candidates`.
 */
function crowdingDistance(candidates, objectives) {
  if (candidates.length === 0) return [];
  const distances = new Map(candidates.map((c) => [c, 0]));
  for (const obj of objectives) {
    // Evaluate the objective once per candidate instead of inside the comparator and neighbour loop.
    const ranked = candidates
      .map((candidate) => ({ candidate, value: obj.value(candidate) }))
      .sort((a, b) => a.value - b.value);
    const first = ranked[0];
    const last = ranked[ranked.length - 1];
    const range = last.value - first.value || 1;
    // Boundary candidates are always kept, so they get an infinite distance.
    distances.set(first.candidate, Infinity);
    distances.set(last.candidate, Infinity);
    for (let i = 1; i < ranked.length - 1; i++) {
      const { candidate } = ranked[i];
      const current = distances.get(candidate);
      if (current === Infinity) continue;
      distances.set(candidate, current + (ranked[i + 1].value - ranked[i - 1].value) / range);
    }
  }
  return candidates.map((c) => ({ candidate: c, distance: distances.get(c) ?? 0 }));
}
|
|
5249
|
+
/**
 * Return the Pareto frontier ordered from least to most crowded.
 *
 * @param {Array<*>} candidates - Candidates to evaluate.
 * @param {Array<object>} objectives - Objectives passed to `paretoFrontier` and `crowdingDistance`.
 * @returns {Array<{candidate: *, distance: number}>} Frontier members sorted by descending crowding distance.
 */
function paretoFrontierWithCrowding(candidates, objectives) {
  const nonDominated = paretoFrontier(candidates, objectives).frontier;
  if (nonDominated.length === 0) return [];
  const measured = crowdingDistance(nonDominated, objectives);
  return measured.sort((left, right) => right.distance - left.distance);
}
|
|
5255
|
+
|
|
5398
5256
|
// src/harness-optimizer.ts
|
|
5399
5257
|
var DEFAULT_HARNESS_OBJECTIVES = [
|
|
5400
5258
|
{ name: "aggregate", direction: "maximize", value: (r) => r.aggregateMean },
|
|
@@ -5485,20 +5343,20 @@ async function mapLimit(items, limit, fn) {
|
|
|
5485
5343
|
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values - Numbers to average.
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean2(values) {
  const count = values.length;
  if (!count) return 0;
  const total = values.reduce((running, value) => running + value, 0);
  return total / count;
}
|
|
5488
|
-
function meanRunScore(
|
|
5346
|
+
/**
 * Average a list of per-run score records field by field.
 *
 * Each numeric field is the mean (via `mean2`) of that field across all
 * records; `notes` is the concatenation of every record's notes.
 *
 * @param {Array<object>} scores2 - Per-run score records.
 * @returns {object} A single record holding the per-field means and merged notes.
 */
function meanRunScore(scores2) {
  const averageOf = (field) => mean2(scores2.map((entry) => entry[field]));
  return {
    success: averageOf("success"),
    goalProgress: averageOf("goalProgress"),
    repoGroundedness: averageOf("repoGroundedness"),
    driftPenalty: averageOf("driftPenalty"),
    toolUseQuality: averageOf("toolUseQuality"),
    patchQuality: averageOf("patchQuality"),
    testReality: averageOf("testReality"),
    finalGate: averageOf("finalGate"),
    reviewerBlockers: averageOf("reviewerBlockers"),
    costUsd: averageOf("costUsd"),
    wallSeconds: averageOf("wallSeconds"),
    // Records without notes contribute nothing.
    notes: scores2.flatMap((entry) => entry.notes ?? [])
  };
}
|
|
5504
5362
|
|
|
@@ -6473,12 +6331,12 @@ function recordRuns(runs, opts) {
|
|
|
6473
6331
|
for (const [scenarioId, scenarioRuns] of byScenario) {
|
|
6474
6332
|
const scored = scenarioRuns.map((run) => ({ run, score: runScore(run) })).filter((s) => s.score !== void 0);
|
|
6475
6333
|
if (scored.length === 0) continue;
|
|
6476
|
-
const
|
|
6334
|
+
const scores2 = scored.map((s) => s.score);
|
|
6477
6335
|
const entry = {
|
|
6478
6336
|
commitSha: opts.commitSha,
|
|
6479
6337
|
timestamp,
|
|
6480
|
-
scores,
|
|
6481
|
-
composite: median(
|
|
6338
|
+
scores: scores2,
|
|
6339
|
+
composite: median(scores2),
|
|
6482
6340
|
runIds: scored.map((s) => s.run.runId)
|
|
6483
6341
|
};
|
|
6484
6342
|
const perDimension = aggregatePerDimension(scenarioRuns);
|
|
@@ -6600,17 +6458,17 @@ function formatScorecardDiff(diff) {
|
|
|
6600
6458
|
lines.push(
|
|
6601
6459
|
`Scorecard: ${summary.regressed} regressed \xB7 ${summary.improved} improved \xB7 ${summary.flat} flat \xB7 ${summary.new} new`
|
|
6602
6460
|
);
|
|
6603
|
-
const
|
|
6461
|
+
const fmt2 = (n) => n.toFixed(3);
|
|
6604
6462
|
const noteworthy = diff.cells.filter((c) => c.verdict === "regressed" || c.verdict === "improved").sort((a, b) => {
|
|
6605
6463
|
if (a.verdict !== b.verdict) return a.verdict === "regressed" ? -1 : 1;
|
|
6606
6464
|
return Math.abs(b.delta ?? 0) - Math.abs(a.delta ?? 0);
|
|
6607
6465
|
});
|
|
6608
6466
|
for (const cell of noteworthy) {
|
|
6609
6467
|
const mark = cell.verdict === "regressed" ? "REGRESSED" : "improved";
|
|
6610
|
-
const deltaStr = cell.delta !== null ? cell.delta >= 0 ? `+${
|
|
6468
|
+
const deltaStr = cell.delta !== null ? cell.delta >= 0 ? `+${fmt2(cell.delta)}` : fmt2(cell.delta) : "\u2014";
|
|
6611
6469
|
const stat = cell.cohensD !== null ? ` (d=${cell.cohensD.toFixed(2)}${cell.pValue !== null ? `, p=${cell.pValue.toFixed(3)}` : ""})` : "";
|
|
6612
6470
|
lines.push(
|
|
6613
|
-
` ${mark} ${cell.scenarioId} \xB7 ${cell.model} \xB7 ${cell.profileHash.slice(0, 8)} ${
|
|
6471
|
+
` ${mark} ${cell.scenarioId} \xB7 ${cell.model} \xB7 ${cell.profileHash.slice(0, 8)} ${fmt2(cell.baseline ?? 0)} \u2192 ${fmt2(cell.current)} ${deltaStr}${stat}`
|
|
6614
6472
|
);
|
|
6615
6473
|
}
|
|
6616
6474
|
return lines.join("\n");
|
|
@@ -6625,10 +6483,10 @@ function analyzeSeries(values, options = {}) {
|
|
|
6625
6483
|
return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
|
|
6626
6484
|
}
|
|
6627
6485
|
const tail = values.slice(-window);
|
|
6628
|
-
const
|
|
6629
|
-
const variance = tail.reduce((acc, v) => acc + (v -
|
|
6486
|
+
const mean5 = tail.reduce((a, b) => a + b, 0) / tail.length;
|
|
6487
|
+
const variance = tail.reduce((acc, v) => acc + (v - mean5) ** 2, 0) / tail.length;
|
|
6630
6488
|
const stdDev = Math.sqrt(variance);
|
|
6631
|
-
const refMean = Math.abs(
|
|
6489
|
+
const refMean = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
|
|
6632
6490
|
const cv = stdDev / refMean;
|
|
6633
6491
|
const stable = tail.length >= window && cv <= stableCv;
|
|
6634
6492
|
let tailRun = 0;
|
|
@@ -6649,7 +6507,7 @@ function analyzeSeries(values, options = {}) {
|
|
|
6649
6507
|
} else {
|
|
6650
6508
|
state = "noisy";
|
|
6651
6509
|
}
|
|
6652
|
-
return { state, windowMean:
|
|
6510
|
+
return { state, windowMean: mean5, windowCv: cv, tailRun, stable };
|
|
6653
6511
|
}
|
|
6654
6512
|
|
|
6655
6513
|
// src/slo.ts
|
|
@@ -7027,12 +6885,12 @@ function renderMarkdownReport(reports) {
|
|
|
7027
6885
|
async function aggregateRunMetrics(runs, store) {
|
|
7028
6886
|
if (runs.length === 0) return {};
|
|
7029
6887
|
const durations = [];
|
|
7030
|
-
const
|
|
6888
|
+
const scores2 = [];
|
|
7031
6889
|
const passes = [];
|
|
7032
6890
|
const costs = [];
|
|
7033
6891
|
for (const r of runs) {
|
|
7034
6892
|
if (r.endedAt) durations.push(r.endedAt - r.startedAt);
|
|
7035
|
-
if (r.outcome?.score !== void 0)
|
|
6893
|
+
if (r.outcome?.score !== void 0) scores2.push(r.outcome.score);
|
|
7036
6894
|
passes.push(r.outcome?.pass === true ? 1 : 0);
|
|
7037
6895
|
const llm = await llmSpans(store, r.runId);
|
|
7038
6896
|
costs.push(aggregateLlm(llm).costUsd);
|
|
@@ -7041,7 +6899,7 @@ async function aggregateRunMetrics(runs, store) {
|
|
|
7041
6899
|
provisionMs: average(durations),
|
|
7042
6900
|
firstTokenMs: average(durations),
|
|
7043
6901
|
wallMs: average(durations),
|
|
7044
|
-
overallScore: average(
|
|
6902
|
+
overallScore: average(scores2),
|
|
7045
6903
|
passRate: average(passes),
|
|
7046
6904
|
costUsd: average(costs)
|
|
7047
6905
|
};
|
|
@@ -7205,7 +7063,7 @@ async function toLangfuseEnvelope(store, runId) {
|
|
|
7205
7063
|
},
|
|
7206
7064
|
metadata: { finishReason: s.finishReason, cachedTokens: s.cachedTokens }
|
|
7207
7065
|
}));
|
|
7208
|
-
const
|
|
7066
|
+
const scores2 = judges.map((j) => ({
|
|
7209
7067
|
id: j.spanId,
|
|
7210
7068
|
traceId: run.runId,
|
|
7211
7069
|
observationId: j.targetSpanId,
|
|
@@ -7213,7 +7071,7 @@ async function toLangfuseEnvelope(store, runId) {
|
|
|
7213
7071
|
value: j.score,
|
|
7214
7072
|
comment: j.rationale
|
|
7215
7073
|
}));
|
|
7216
|
-
return { traceId: run.runId, generations, scores };
|
|
7074
|
+
return { traceId: run.runId, generations, scores: scores2 };
|
|
7217
7075
|
}
|
|
7218
7076
|
async function toPrometheusText(store) {
|
|
7219
7077
|
const runs = await store.listRuns();
|
|
@@ -7314,12 +7172,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
|
|
|
7314
7172
|
variantScores.push({ mutator: id, score, mutated });
|
|
7315
7173
|
all.push(score);
|
|
7316
7174
|
}
|
|
7317
|
-
const
|
|
7318
|
-
const variance = all.reduce((a, v) => a + (v -
|
|
7175
|
+
const mean5 = all.reduce((a, b) => a + b, 0) / all.length;
|
|
7176
|
+
const variance = all.reduce((a, v) => a + (v - mean5) ** 2, 0) / all.length;
|
|
7319
7177
|
const stdDev = Math.sqrt(variance);
|
|
7320
|
-
const ref = Math.abs(
|
|
7178
|
+
const ref = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
|
|
7321
7179
|
const robustness = Math.max(0, 1 - stdDev / ref);
|
|
7322
|
-
return { originalScore, variantScores, meanScore:
|
|
7180
|
+
return { originalScore, variantScores, meanScore: mean5, stdDev, robustness };
|
|
7323
7181
|
}
|
|
7324
7182
|
// Paraphrase mutator: returns the prompt converted to lower case.
var lowercaseMutator = (prompt) => {
  return prompt.toLowerCase();
};
|
|
7325
7183
|
var sentenceReorderMutator = (p, seed) => {
|
|
@@ -7376,18 +7234,18 @@ async function paraphraseRobustnessScenarios(args) {
|
|
|
7376
7234
|
const deltas = {};
|
|
7377
7235
|
const paraphrasedAll = [];
|
|
7378
7236
|
for (const m of args.mutators) {
|
|
7379
|
-
const
|
|
7237
|
+
const scores2 = [];
|
|
7380
7238
|
for (let r = 0; r < reps; r++) {
|
|
7381
7239
|
const mutatedTurns = scenario.userTurns.map((t) => m.mutator(t));
|
|
7382
7240
|
const out = await args.runScenario({
|
|
7383
7241
|
id: scenario.id,
|
|
7384
7242
|
userTurns: mutatedTurns
|
|
7385
7243
|
});
|
|
7386
|
-
|
|
7244
|
+
scores2.push(out.score);
|
|
7387
7245
|
}
|
|
7388
|
-
const
|
|
7389
|
-
deltas[m.name] =
|
|
7390
|
-
paraphrasedAll.push(...
|
|
7246
|
+
const mean5 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
|
|
7247
|
+
deltas[m.name] = mean5 - originalScore;
|
|
7248
|
+
paraphrasedAll.push(...scores2);
|
|
7391
7249
|
}
|
|
7392
7250
|
const paraphrasedMean = paraphrasedAll.length === 0 ? originalScore : paraphrasedAll.reduce((a, b) => a + b, 0) / paraphrasedAll.length;
|
|
7393
7251
|
perScenario.push({ id: scenario.id, originalScore, paraphrasedMean, deltas });
|
|
@@ -7802,10 +7660,10 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
7802
7660
|
}
|
|
7803
7661
|
for (const s of scenarios) {
|
|
7804
7662
|
const sRuns = runs.filter((r) => r.scenarioId === s.id);
|
|
7805
|
-
const
|
|
7806
|
-
if (
|
|
7807
|
-
const
|
|
7808
|
-
const variance =
|
|
7663
|
+
const scores2 = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
|
|
7664
|
+
if (scores2.length < 3) continue;
|
|
7665
|
+
const mean5 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
|
|
7666
|
+
const variance = scores2.reduce((a, b) => a + (b - mean5) ** 2, 0) / scores2.length;
|
|
7809
7667
|
if (variance > varianceThreshold) {
|
|
7810
7668
|
targets.push({
|
|
7811
7669
|
reason: "high-variance",
|
|
@@ -7979,15 +7837,15 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
|
|
|
7979
7837
|
const rejected = [];
|
|
7980
7838
|
const surviving = [];
|
|
7981
7839
|
for (const candidate of proposed) {
|
|
7982
|
-
const
|
|
7983
|
-
if (
|
|
7840
|
+
const scores2 = await scorer.scoreCandidate(candidate, targets);
|
|
7841
|
+
if (scores2.length < 2) {
|
|
7984
7842
|
rejected.push({ candidate, reason: "scorer returned <2 results" });
|
|
7985
7843
|
continue;
|
|
7986
7844
|
}
|
|
7987
|
-
const values =
|
|
7845
|
+
const values = scores2.map((s) => s.score);
|
|
7988
7846
|
const spread = Math.max(...values) - Math.min(...values);
|
|
7989
7847
|
const maxScore = Math.max(...values);
|
|
7990
|
-
scored.push({ candidate, scores, spread });
|
|
7848
|
+
scored.push({ candidate, scores: scores2, spread });
|
|
7991
7849
|
if (maxScore < floor) {
|
|
7992
7850
|
rejected.push({
|
|
7993
7851
|
candidate,
|
|
@@ -9138,15 +8996,15 @@ function scoreReferenceReplay(scenarios, options = {}) {
|
|
|
9138
8996
|
const threshold = options.matchThreshold ?? DEFAULT_MATCH_THRESHOLD;
|
|
9139
8997
|
const matchStrategy = options.matchStrategy ?? "reference-order";
|
|
9140
8998
|
const allowedSplits = new Set(options.splits ?? ALL_SPLITS);
|
|
9141
|
-
const
|
|
8999
|
+
const scores2 = scenarios.filter((scenario) => {
|
|
9142
9000
|
const split = scenario.split ?? "train";
|
|
9143
9001
|
if (split === "holdout" && !options.includeHoldout) return false;
|
|
9144
9002
|
return allowedSplits.has(split);
|
|
9145
9003
|
}).map((scenario) => scoreScenario(scenario, matcher, threshold, matchStrategy));
|
|
9146
9004
|
return {
|
|
9147
|
-
scenarios:
|
|
9148
|
-
aggregate: aggregateScenarioScores(
|
|
9149
|
-
bySplit: aggregateBySplit(
|
|
9005
|
+
scenarios: scores2,
|
|
9006
|
+
aggregate: aggregateScenarioScores(scores2),
|
|
9007
|
+
bySplit: aggregateBySplit(scores2)
|
|
9150
9008
|
};
|
|
9151
9009
|
}
|
|
9152
9010
|
function compareReferenceReplay(baseline, candidate) {
|
|
@@ -9369,20 +9227,20 @@ function buildScenarioScore(scenario, matches, falsePositives) {
|
|
|
9369
9227
|
matches
|
|
9370
9228
|
};
|
|
9371
9229
|
}
|
|
9372
|
-
function aggregateBySplit(
|
|
9230
|
+
function aggregateBySplit(scores2) {
|
|
9373
9231
|
const out = {};
|
|
9374
9232
|
for (const split of ALL_SPLITS) {
|
|
9375
|
-
const scoped =
|
|
9233
|
+
const scoped = scores2.filter((score) => score.split === split);
|
|
9376
9234
|
if (scoped.length > 0) out[split] = aggregateScenarioScores(scoped);
|
|
9377
9235
|
}
|
|
9378
9236
|
return out;
|
|
9379
9237
|
}
|
|
9380
|
-
function aggregateScenarioScores(
|
|
9381
|
-
const matched = sum2(
|
|
9382
|
-
const total = sum2(
|
|
9383
|
-
const falsePositives = sum2(
|
|
9384
|
-
const matchedWeight = sum2(
|
|
9385
|
-
const totalWeight = sum2(
|
|
9238
|
+
function aggregateScenarioScores(scores2) {
|
|
9239
|
+
const matched = sum2(scores2.map((score) => score.matched));
|
|
9240
|
+
const total = sum2(scores2.map((score) => score.total));
|
|
9241
|
+
const falsePositives = sum2(scores2.map((score) => score.falsePositives));
|
|
9242
|
+
const matchedWeight = sum2(scores2.map((score) => score.matchedWeight));
|
|
9243
|
+
const totalWeight = sum2(scores2.map((score) => score.totalWeight));
|
|
9386
9244
|
const precision2 = ratio(matched, matched + falsePositives);
|
|
9387
9245
|
const recall = ratio(matched, total);
|
|
9388
9246
|
return {
|
|
@@ -9625,154 +9483,6 @@ function createDefaultReviewer(options) {
|
|
|
9625
9483
|
};
|
|
9626
9484
|
}
|
|
9627
9485
|
|
|
9628
|
-
// src/code-mutator.ts
|
|
9629
|
-
function createSandboxCodeMutator(opts) {
|
|
9630
|
-
const childIdFor = opts.childIdFor ?? ((parent, generation, index) => `${parent.id}.g${generation}.code.${index}`);
|
|
9631
|
-
const labelFor = opts.labelFor ?? ((outcome, parent, _generation, index) => outcome.description?.slice(0, 80) ?? `${parent.label} \u2192 code.${index}`);
|
|
9632
|
-
return {
|
|
9633
|
-
async mutate(args) {
|
|
9634
|
-
const { parent, parentAggregate, topTrials, bottomTrials, childCount, generation } = args;
|
|
9635
|
-
const startedAt = Date.now();
|
|
9636
|
-
const outcomes = await opts.pool.withSlot(async (slot) => {
|
|
9637
|
-
try {
|
|
9638
|
-
return await opts.runner({
|
|
9639
|
-
slot,
|
|
9640
|
-
parent,
|
|
9641
|
-
parentAggregate,
|
|
9642
|
-
topTrials,
|
|
9643
|
-
bottomTrials,
|
|
9644
|
-
childCount,
|
|
9645
|
-
generation
|
|
9646
|
-
});
|
|
9647
|
-
} catch (err) {
|
|
9648
|
-
return [
|
|
9649
|
-
{
|
|
9650
|
-
ok: false,
|
|
9651
|
-
failureReason: "runner_error",
|
|
9652
|
-
description: err instanceof Error ? err.message : String(err),
|
|
9653
|
-
latencyMs: Date.now() - startedAt
|
|
9654
|
-
}
|
|
9655
|
-
];
|
|
9656
|
-
}
|
|
9657
|
-
});
|
|
9658
|
-
const variants = [];
|
|
9659
|
-
let index = 0;
|
|
9660
|
-
for (const outcome of outcomes) {
|
|
9661
|
-
const childId = outcome.childId ?? childIdFor(parent, generation, index);
|
|
9662
|
-
if (opts.mutationTelemetry) {
|
|
9663
|
-
await opts.mutationTelemetry.record({
|
|
9664
|
-
ts: Date.now(),
|
|
9665
|
-
channel: "code",
|
|
9666
|
-
generation,
|
|
9667
|
-
parentId: parent.id,
|
|
9668
|
-
childId: outcome.ok ? childId : null,
|
|
9669
|
-
ok: outcome.ok,
|
|
9670
|
-
failureReason: outcome.failureReason,
|
|
9671
|
-
description: outcome.description,
|
|
9672
|
-
latencyMs: outcome.latencyMs,
|
|
9673
|
-
diffBytes: outcome.diffBytes,
|
|
9674
|
-
filesTouched: outcome.filesTouched,
|
|
9675
|
-
agentSteps: outcome.agentSteps,
|
|
9676
|
-
costUsd: outcome.costUsd
|
|
9677
|
-
});
|
|
9678
|
-
}
|
|
9679
|
-
if (opts.costLedger && outcome.costUsd !== void 0) {
|
|
9680
|
-
await opts.costLedger.addMutation("code", outcome.costUsd, { generation });
|
|
9681
|
-
}
|
|
9682
|
-
if (outcome.ok) {
|
|
9683
|
-
const variant = {
|
|
9684
|
-
id: childId,
|
|
9685
|
-
payload: opts.toVariantPayload(outcome, parent),
|
|
9686
|
-
generation,
|
|
9687
|
-
parentId: parent.id,
|
|
9688
|
-
label: labelFor(outcome, parent, generation, index),
|
|
9689
|
-
...outcome.rationale ? { rationale: outcome.rationale } : {}
|
|
9690
|
-
};
|
|
9691
|
-
variants.push(variant);
|
|
9692
|
-
if (opts.lineage) {
|
|
9693
|
-
await opts.lineage.upsert({
|
|
9694
|
-
id: variant.id,
|
|
9695
|
-
parentId: variant.parentId ?? null,
|
|
9696
|
-
generation: variant.generation,
|
|
9697
|
-
kind: "code",
|
|
9698
|
-
...variant.rationale ? { rationale: variant.rationale } : {}
|
|
9699
|
-
});
|
|
9700
|
-
}
|
|
9701
|
-
}
|
|
9702
|
-
index++;
|
|
9703
|
-
}
|
|
9704
|
-
if (opts.costLedger) {
|
|
9705
|
-
const u = opts.pool.utilization();
|
|
9706
|
-
await opts.costLedger.setPoolUtilization(u.busyMs, u.totalMs);
|
|
9707
|
-
}
|
|
9708
|
-
return variants;
|
|
9709
|
-
}
|
|
9710
|
-
};
|
|
9711
|
-
}
|
|
9712
|
-
|
|
9713
|
-
// src/composite-mutator.ts
|
|
9714
|
-
function createCompositeMutator(opts) {
|
|
9715
|
-
const recentScores = [];
|
|
9716
|
-
const plateauThreshold = opts.plateauThreshold ?? 0.02;
|
|
9717
|
-
const plateauPatience = opts.plateauPatience ?? 2;
|
|
9718
|
-
function pickMode(args) {
|
|
9719
|
-
recentScores.push(args.parentAggregate.meanScore);
|
|
9720
|
-
switch (opts.policy) {
|
|
9721
|
-
case "primary-only":
|
|
9722
|
-
return { mode: "primary", reason: "policy=primary-only" };
|
|
9723
|
-
case "secondary-only":
|
|
9724
|
-
if (!opts.secondary)
|
|
9725
|
-
return {
|
|
9726
|
-
mode: "primary",
|
|
9727
|
-
reason: "secondary-only requested but no secondary mutator wired"
|
|
9728
|
-
};
|
|
9729
|
-
return { mode: "secondary", reason: "policy=secondary-only" };
|
|
9730
|
-
case "alternate":
|
|
9731
|
-
if (!opts.secondary)
|
|
9732
|
-
return { mode: "primary", reason: "alternate requested but no secondary mutator wired" };
|
|
9733
|
-
return args.generation % 2 === 1 ? { mode: "secondary", reason: `alternate: gen${args.generation} odd \u2192 secondary` } : { mode: "primary", reason: `alternate: gen${args.generation} even \u2192 primary` };
|
|
9734
|
-
case "plateau": {
|
|
9735
|
-
if (!opts.secondary)
|
|
9736
|
-
return { mode: "primary", reason: "plateau requested but no secondary mutator wired" };
|
|
9737
|
-
if (recentScores.length <= plateauPatience) {
|
|
9738
|
-
return { mode: "primary", reason: "plateau: warming up with primary mutations" };
|
|
9739
|
-
}
|
|
9740
|
-
const window = recentScores.slice(-plateauPatience - 1);
|
|
9741
|
-
const deltas = window.slice(1).map((v, i) => v - window[i]);
|
|
9742
|
-
const stagnant = deltas.every((d) => d < plateauThreshold);
|
|
9743
|
-
if (stagnant) {
|
|
9744
|
-
return {
|
|
9745
|
-
mode: "split",
|
|
9746
|
-
reason: `plateau detected (${deltas.map((d) => d.toFixed(3)).join(", ")}) \u2192 split`
|
|
9747
|
-
};
|
|
9748
|
-
}
|
|
9749
|
-
return {
|
|
9750
|
-
mode: "primary",
|
|
9751
|
-
reason: `plateau: still improving (${deltas[deltas.length - 1].toFixed(3)})`
|
|
9752
|
-
};
|
|
9753
|
-
}
|
|
9754
|
-
}
|
|
9755
|
-
}
|
|
9756
|
-
return {
|
|
9757
|
-
async mutate(args) {
|
|
9758
|
-
const { mode, reason } = pickMode(args);
|
|
9759
|
-
opts.onPolicyDecision?.({ generation: args.generation, chose: mode, reason });
|
|
9760
|
-
if (mode === "primary") return opts.primary.mutate(args);
|
|
9761
|
-
if (mode === "secondary" && opts.secondary) return opts.secondary.mutate(args);
|
|
9762
|
-
if (mode === "split" && opts.secondary) {
|
|
9763
|
-
const secondaryShare = Math.ceil(args.childCount / 2);
|
|
9764
|
-
const primaryShare = args.childCount - secondaryShare;
|
|
9765
|
-
const [primaryChildren, secondaryChildren] = await Promise.all([
|
|
9766
|
-
opts.primary.mutate({ ...args, childCount: primaryShare }),
|
|
9767
|
-
opts.secondary.mutate({ ...args, childCount: secondaryShare })
|
|
9768
|
-
]);
|
|
9769
|
-
return [...primaryChildren, ...secondaryChildren];
|
|
9770
|
-
}
|
|
9771
|
-
return opts.primary.mutate(args);
|
|
9772
|
-
}
|
|
9773
|
-
};
|
|
9774
|
-
}
|
|
9775
|
-
|
|
9776
9486
|
// src/discover-personas.ts
|
|
9777
9487
|
import { promises as fs } from "fs";
|
|
9778
9488
|
import { basename, extname, join as join3 } from "path";
|
|
@@ -9819,238 +9529,6 @@ async function discoverPersonas(dir, opts = {}) {
|
|
|
9819
9529
|
return results;
|
|
9820
9530
|
}
|
|
9821
9531
|
|
|
9822
|
-
// src/evolution-telemetry.ts
|
|
9823
|
-
import { appendFileSync as appendFileSync4, existsSync as existsSync7, mkdirSync as mkdirSync4, readFileSync as readFileSync6, writeFileSync } from "fs";
|
|
9824
|
-
import { dirname as dirname4 } from "path";
|
|
9825
|
-
var MutationTelemetry = class {
|
|
9826
|
-
appender;
|
|
9827
|
-
constructor(path) {
|
|
9828
|
-
this.appender = new LockedJsonlAppender(path);
|
|
9829
|
-
}
|
|
9830
|
-
async record(attempt) {
|
|
9831
|
-
await this.appender.append(attempt);
|
|
9832
|
-
}
|
|
9833
|
-
};
|
|
9834
|
-
var TrialTelemetry = class {
|
|
9835
|
-
appender;
|
|
9836
|
-
constructor(path) {
|
|
9837
|
-
this.appender = new LockedJsonlAppender(path);
|
|
9838
|
-
}
|
|
9839
|
-
async record(attempt) {
|
|
9840
|
-
await this.appender.append(attempt);
|
|
9841
|
-
}
|
|
9842
|
-
};
|
|
9843
|
-
var LineageRecorder = class {
|
|
9844
|
-
path;
|
|
9845
|
-
snapshotPath;
|
|
9846
|
-
mutex = new Mutex();
|
|
9847
|
-
nodes = /* @__PURE__ */ new Map();
|
|
9848
|
-
kindOf;
|
|
9849
|
-
constructor(path, kindOf) {
|
|
9850
|
-
this.path = path;
|
|
9851
|
-
this.snapshotPath = `${path}.snapshot`;
|
|
9852
|
-
this.kindOf = kindOf ?? defaultKindOf;
|
|
9853
|
-
mkdirSync4(dirname4(path), { recursive: true });
|
|
9854
|
-
if (existsSync7(this.snapshotPath)) {
|
|
9855
|
-
try {
|
|
9856
|
-
const parsed = JSON.parse(readFileSync6(this.snapshotPath, "utf-8"));
|
|
9857
|
-
for (const n of parsed) this.nodes.set(n.id, n);
|
|
9858
|
-
} catch {
|
|
9859
|
-
}
|
|
9860
|
-
}
|
|
9861
|
-
if (existsSync7(path)) {
|
|
9862
|
-
try {
|
|
9863
|
-
for (const line of readFileSync6(path, "utf-8").split("\n")) {
|
|
9864
|
-
if (!line.trim()) continue;
|
|
9865
|
-
try {
|
|
9866
|
-
const entry = JSON.parse(line);
|
|
9867
|
-
const prev = this.nodes.get(entry.id);
|
|
9868
|
-
this.nodes.set(entry.id, { ...prev, ...entry });
|
|
9869
|
-
} catch {
|
|
9870
|
-
}
|
|
9871
|
-
}
|
|
9872
|
-
} catch {
|
|
9873
|
-
}
|
|
9874
|
-
}
|
|
9875
|
-
if (existsSync7(path) && this.nodes.size === 0) {
|
|
9876
|
-
try {
|
|
9877
|
-
const raw = readFileSync6(path, "utf-8").trim();
|
|
9878
|
-
if (raw.startsWith("[")) {
|
|
9879
|
-
const parsed = JSON.parse(raw);
|
|
9880
|
-
for (const n of parsed) this.nodes.set(n.id, n);
|
|
9881
|
-
}
|
|
9882
|
-
} catch {
|
|
9883
|
-
}
|
|
9884
|
-
}
|
|
9885
|
-
}
|
|
9886
|
-
async upsert(node) {
|
|
9887
|
-
await this.mutex.runExclusive(() => {
|
|
9888
|
-
const prev = this.nodes.get(node.id);
|
|
9889
|
-
this.nodes.set(node.id, { ...prev, ...node });
|
|
9890
|
-
try {
|
|
9891
|
-
if (existsSync7(this.path)) {
|
|
9892
|
-
const head = readFileSync6(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
|
|
9893
|
-
if (head === "[") {
|
|
9894
|
-
writeFileSync(this.path, "");
|
|
9895
|
-
}
|
|
9896
|
-
}
|
|
9897
|
-
} catch {
|
|
9898
|
-
}
|
|
9899
|
-
appendFileSync4(this.path, `${JSON.stringify(this.nodes.get(node.id))}
|
|
9900
|
-
`);
|
|
9901
|
-
});
|
|
9902
|
-
}
|
|
9903
|
-
async upsertVariant(variant, opts = {}) {
|
|
9904
|
-
await this.upsert({
|
|
9905
|
-
id: variant.id,
|
|
9906
|
-
parentId: variant.parentId ?? null,
|
|
9907
|
-
generation: variant.generation,
|
|
9908
|
-
kind: this.kindOf(variant),
|
|
9909
|
-
...variant.rationale ? { rationale: variant.rationale } : {},
|
|
9910
|
-
...opts.omitPayload || variant.payload === void 0 ? {} : { payload: variant.payload }
|
|
9911
|
-
});
|
|
9912
|
-
}
|
|
9913
|
-
snapshot() {
|
|
9914
|
-
return [...this.nodes.values()];
|
|
9915
|
-
}
|
|
9916
|
-
/**
|
|
9917
|
-
* Write the current consolidated state to `<path>.snapshot` so external
|
|
9918
|
-
* tools can read it without replaying the event log. Idempotent.
|
|
9919
|
-
*/
|
|
9920
|
-
async compact() {
|
|
9921
|
-
await this.mutex.runExclusive(() => {
|
|
9922
|
-
writeFileSync(this.snapshotPath, JSON.stringify([...this.nodes.values()], null, 2));
|
|
9923
|
-
});
|
|
9924
|
-
}
|
|
9925
|
-
};
|
|
9926
|
-
function defaultKindOf(variant) {
|
|
9927
|
-
if (variant.parentId === void 0) return "seed";
|
|
9928
|
-
const payload = variant.payload;
|
|
9929
|
-
if (payload && typeof payload === "object" && payload.codeMutation) return "code";
|
|
9930
|
-
return "prompt";
|
|
9931
|
-
}
|
|
9932
|
-
function emptyGenBucket() {
|
|
9933
|
-
return {
|
|
9934
|
-
mutatorPromptUsd: 0,
|
|
9935
|
-
mutatorCodeUsd: 0,
|
|
9936
|
-
scorerPromptUsd: 0,
|
|
9937
|
-
scorerCodeUsd: 0,
|
|
9938
|
-
trialsCounted: 0,
|
|
9939
|
-
cachedTrials: 0
|
|
9940
|
-
};
|
|
9941
|
-
}
|
|
9942
|
-
var CostLedger = class {
|
|
9943
|
-
totals = {
|
|
9944
|
-
mutatorPromptUsd: 0,
|
|
9945
|
-
mutatorCodeUsd: 0,
|
|
9946
|
-
scorerPromptUsd: 0,
|
|
9947
|
-
scorerCodeUsd: 0,
|
|
9948
|
-
trialsCounted: 0,
|
|
9949
|
-
cachedTrials: 0,
|
|
9950
|
-
poolBusyMs: 0,
|
|
9951
|
-
poolUtilizationPct: 0,
|
|
9952
|
-
byGeneration: {}
|
|
9953
|
-
};
|
|
9954
|
-
path;
|
|
9955
|
-
mutex = new Mutex();
|
|
9956
|
-
constructor(path) {
|
|
9957
|
-
this.path = path;
|
|
9958
|
-
if (existsSync7(path)) {
|
|
9959
|
-
try {
|
|
9960
|
-
const loaded = JSON.parse(readFileSync6(path, "utf-8"));
|
|
9961
|
-
for (const k of Object.keys(this.totals)) {
|
|
9962
|
-
if (k === "byGeneration") {
|
|
9963
|
-
if (loaded.byGeneration && typeof loaded.byGeneration === "object") {
|
|
9964
|
-
this.totals.byGeneration = loaded.byGeneration;
|
|
9965
|
-
}
|
|
9966
|
-
continue;
|
|
9967
|
-
}
|
|
9968
|
-
const v = loaded[k];
|
|
9969
|
-
if (typeof v === "number" && Number.isFinite(v)) {
|
|
9970
|
-
;
|
|
9971
|
-
this.totals[k] = v;
|
|
9972
|
-
}
|
|
9973
|
-
}
|
|
9974
|
-
} catch {
|
|
9975
|
-
}
|
|
9976
|
-
} else {
|
|
9977
|
-
mkdirSync4(dirname4(path), { recursive: true });
|
|
9978
|
-
}
|
|
9979
|
-
}
|
|
9980
|
-
genBucket(generation) {
|
|
9981
|
-
if (generation === void 0) return null;
|
|
9982
|
-
const key = String(generation);
|
|
9983
|
-
if (!this.totals.byGeneration[key]) {
|
|
9984
|
-
this.totals.byGeneration[key] = emptyGenBucket();
|
|
9985
|
-
}
|
|
9986
|
-
return this.totals.byGeneration[key];
|
|
9987
|
-
}
|
|
9988
|
-
async addMutation(channel, usd, opts = {}) {
|
|
9989
|
-
await this.mutex.runExclusive(() => {
|
|
9990
|
-
const bucket = this.genBucket(opts.generation);
|
|
9991
|
-
if (channel === "prompt") {
|
|
9992
|
-
this.totals.mutatorPromptUsd += usd;
|
|
9993
|
-
if (bucket) bucket.mutatorPromptUsd += usd;
|
|
9994
|
-
} else {
|
|
9995
|
-
this.totals.mutatorCodeUsd += usd;
|
|
9996
|
-
if (bucket) bucket.mutatorCodeUsd += usd;
|
|
9997
|
-
}
|
|
9998
|
-
this.persist();
|
|
9999
|
-
});
|
|
10000
|
-
}
|
|
10001
|
-
async addTrial(channel, usd, cached, opts = {}) {
|
|
10002
|
-
await this.mutex.runExclusive(() => {
|
|
10003
|
-
const bucket = this.genBucket(opts.generation);
|
|
10004
|
-
if (cached) {
|
|
10005
|
-
this.totals.cachedTrials++;
|
|
10006
|
-
this.totals.trialsCounted++;
|
|
10007
|
-
if (bucket) {
|
|
10008
|
-
bucket.cachedTrials++;
|
|
10009
|
-
bucket.trialsCounted++;
|
|
10010
|
-
}
|
|
10011
|
-
this.persist();
|
|
10012
|
-
return;
|
|
10013
|
-
}
|
|
10014
|
-
if (channel === "prompt") {
|
|
10015
|
-
this.totals.scorerPromptUsd += usd;
|
|
10016
|
-
if (bucket) bucket.scorerPromptUsd += usd;
|
|
10017
|
-
} else {
|
|
10018
|
-
this.totals.scorerCodeUsd += usd;
|
|
10019
|
-
if (bucket) bucket.scorerCodeUsd += usd;
|
|
10020
|
-
}
|
|
10021
|
-
this.totals.trialsCounted++;
|
|
10022
|
-
if (bucket) bucket.trialsCounted++;
|
|
10023
|
-
this.persist();
|
|
10024
|
-
});
|
|
10025
|
-
}
|
|
10026
|
-
async setPoolUtilization(busyMs, totalMs) {
|
|
10027
|
-
await this.mutex.runExclusive(() => {
|
|
10028
|
-
this.totals.poolBusyMs = busyMs;
|
|
10029
|
-
this.totals.poolUtilizationPct = totalMs > 0 ? 100 * busyMs / totalMs : 0;
|
|
10030
|
-
this.persist();
|
|
10031
|
-
});
|
|
10032
|
-
}
|
|
10033
|
-
snapshot() {
|
|
10034
|
-
const totalUsd = this.totals.mutatorPromptUsd + this.totals.mutatorCodeUsd + this.totals.scorerPromptUsd + this.totals.scorerCodeUsd;
|
|
10035
|
-
const byGeneration = Object.entries(this.totals.byGeneration).map(([g, b]) => ({ generation: Number(g), ...b })).sort((a, b) => a.generation - b.generation);
|
|
10036
|
-
return {
|
|
10037
|
-
totalUsd,
|
|
10038
|
-
mutatorPromptUsd: this.totals.mutatorPromptUsd,
|
|
10039
|
-
mutatorCodeUsd: this.totals.mutatorCodeUsd,
|
|
10040
|
-
scorerPromptUsd: this.totals.scorerPromptUsd,
|
|
10041
|
-
scorerCodeUsd: this.totals.scorerCodeUsd,
|
|
10042
|
-
trialsCounted: this.totals.trialsCounted,
|
|
10043
|
-
cachedTrials: this.totals.cachedTrials,
|
|
10044
|
-
poolBusyMs: this.totals.poolBusyMs,
|
|
10045
|
-
poolUtilizationPct: this.totals.poolUtilizationPct,
|
|
10046
|
-
byGeneration
|
|
10047
|
-
};
|
|
10048
|
-
}
|
|
10049
|
-
persist() {
|
|
10050
|
-
writeFileSync(this.path, JSON.stringify(this.totals, null, 2));
|
|
10051
|
-
}
|
|
10052
|
-
};
|
|
10053
|
-
|
|
10054
9532
|
// src/golden-matcher.ts
|
|
10055
9533
|
function matchGoldens(goldens, candidates, options = {}) {
|
|
10056
9534
|
const extract = options.text ?? defaultExtract2;
|
|
@@ -10125,52 +9603,164 @@ function precision(goldens, candidates, options = {}) {
|
|
|
10125
9603
|
return matched / candidates.length;
|
|
10126
9604
|
}
|
|
10127
9605
|
|
|
10128
|
-
// src/
|
|
10129
|
-
|
|
10130
|
-
|
|
10131
|
-
|
|
10132
|
-
|
|
10133
|
-
|
|
10134
|
-
|
|
10135
|
-
|
|
10136
|
-
|
|
10137
|
-
|
|
10138
|
-
|
|
10139
|
-
|
|
10140
|
-
|
|
10141
|
-
|
|
10142
|
-
|
|
10143
|
-
|
|
10144
|
-
|
|
10145
|
-
|
|
10146
|
-
|
|
10147
|
-
|
|
9606
|
+
// src/held-out-gate.ts
|
|
9607
|
+
var HeldOutGate = class {
|
|
9608
|
+
minProductiveRuns;
|
|
9609
|
+
pairedDeltaThreshold;
|
|
9610
|
+
overfitGapThreshold;
|
|
9611
|
+
baselineKey;
|
|
9612
|
+
confidence;
|
|
9613
|
+
resamples;
|
|
9614
|
+
seed;
|
|
9615
|
+
constructor(config) {
|
|
9616
|
+
if (!config.baselineKey) {
|
|
9617
|
+
throw new Error("HeldOutGate: baselineKey is required");
|
|
9618
|
+
}
|
|
9619
|
+
this.minProductiveRuns = config.minProductiveRuns ?? 3;
|
|
9620
|
+
this.pairedDeltaThreshold = config.pairedDeltaThreshold ?? 0;
|
|
9621
|
+
this.overfitGapThreshold = config.overfitGapThreshold ?? 0.15;
|
|
9622
|
+
this.baselineKey = config.baselineKey;
|
|
9623
|
+
this.confidence = config.confidence ?? 0.95;
|
|
9624
|
+
this.resamples = config.bootstrapResamples ?? 2e3;
|
|
9625
|
+
this.seed = config.seed;
|
|
9626
|
+
}
|
|
9627
|
+
/** Decide whether `candidate` should replace `baseline`. Pairing
|
|
9628
|
+
* is by (experimentId, seed) — identical experiment + seed pairs
|
|
9629
|
+
* the candidate run with the matching baseline run. Pairs without
|
|
9630
|
+
* a holdout score on both sides are dropped. */
|
|
9631
|
+
evaluate(candidate, baseline) {
|
|
9632
|
+
const candidateId = inferCandidateId(candidate, this.baselineKey);
|
|
9633
|
+
const baselineId = this.baselineKey;
|
|
9634
|
+
const baselineHoldoutByKey = indexHoldoutByKey(baseline);
|
|
9635
|
+
const beforeHoldout = [];
|
|
9636
|
+
const afterHoldout = [];
|
|
9637
|
+
for (const run of candidate) {
|
|
9638
|
+
if (run.splitTag !== "holdout") continue;
|
|
9639
|
+
if (run.outcome.holdoutScore === void 0) continue;
|
|
9640
|
+
const key = pairKey(run);
|
|
9641
|
+
const counterpart = baselineHoldoutByKey.get(key);
|
|
9642
|
+
if (counterpart === void 0) continue;
|
|
9643
|
+
beforeHoldout.push(counterpart);
|
|
9644
|
+
afterHoldout.push(run.outcome.holdoutScore);
|
|
9645
|
+
}
|
|
9646
|
+
const productiveRuns = beforeHoldout.length;
|
|
9647
|
+
const candidateSearchMean = mean4(scores(candidate, "searchScore", "search"));
|
|
9648
|
+
const candidateHoldoutMean = mean4(scores(candidate, "holdoutScore", "holdout"));
|
|
9649
|
+
const baselineSearchMean = mean4(scores(baseline, "searchScore", "search"));
|
|
9650
|
+
const baselineHoldoutMean = mean4(scores(baseline, "holdoutScore", "holdout"));
|
|
9651
|
+
const overfitGap = safeDiff(candidateSearchMean, candidateHoldoutMean);
|
|
9652
|
+
const baselineOverfitGap = safeDiff(baselineSearchMean, baselineHoldoutMean);
|
|
9653
|
+
if (productiveRuns < this.minProductiveRuns) {
|
|
9654
|
+
return {
|
|
9655
|
+
promote: false,
|
|
9656
|
+
candidateId,
|
|
9657
|
+
baselineId,
|
|
9658
|
+
evidence: {
|
|
9659
|
+
productiveRuns,
|
|
9660
|
+
medianPairedDelta: productiveRuns > 0 ? medianDelta(beforeHoldout, afterHoldout) : 0,
|
|
9661
|
+
pairedCI: { low: 0, high: 0 },
|
|
9662
|
+
pairedPValue: 1,
|
|
9663
|
+
searchScore: candidateSearchMean,
|
|
9664
|
+
holdoutScore: candidateHoldoutMean,
|
|
9665
|
+
overfitGap,
|
|
9666
|
+
baselineOverfitGap
|
|
9667
|
+
},
|
|
9668
|
+
reason: `few_runs: ${productiveRuns} paired holdout observation(s) < min ${this.minProductiveRuns}`,
|
|
9669
|
+
rejectionCode: "few_runs"
|
|
9670
|
+
};
|
|
10148
9671
|
}
|
|
10149
|
-
|
|
10150
|
-
|
|
10151
|
-
|
|
10152
|
-
|
|
9672
|
+
const ci = pairedBootstrap(beforeHoldout, afterHoldout, {
|
|
9673
|
+
confidence: this.confidence,
|
|
9674
|
+
resamples: this.resamples,
|
|
9675
|
+
statistic: "median",
|
|
9676
|
+
seed: this.seed
|
|
9677
|
+
});
|
|
9678
|
+
const wilcoxon = wilcoxonSignedRank(beforeHoldout, afterHoldout);
|
|
9679
|
+
const evidence = {
|
|
9680
|
+
productiveRuns,
|
|
9681
|
+
medianPairedDelta: ci.median,
|
|
9682
|
+
pairedCI: { low: ci.low, high: ci.high },
|
|
9683
|
+
pairedPValue: wilcoxon.p,
|
|
9684
|
+
searchScore: candidateSearchMean,
|
|
9685
|
+
holdoutScore: candidateHoldoutMean,
|
|
9686
|
+
overfitGap,
|
|
9687
|
+
baselineOverfitGap
|
|
9688
|
+
};
|
|
9689
|
+
if (!(ci.low > this.pairedDeltaThreshold)) {
|
|
9690
|
+
return {
|
|
9691
|
+
promote: false,
|
|
9692
|
+
candidateId,
|
|
9693
|
+
baselineId,
|
|
9694
|
+
evidence,
|
|
9695
|
+
reason: `negative_delta: paired holdout median \u0394=${fmt(ci.median)} CI=[${fmt(ci.low)}, ${fmt(ci.high)}] does not clear threshold ${fmt(this.pairedDeltaThreshold)}`,
|
|
9696
|
+
rejectionCode: "negative_delta"
|
|
9697
|
+
};
|
|
9698
|
+
}
|
|
9699
|
+
if (Number.isFinite(overfitGap) && Number.isFinite(baselineOverfitGap) && overfitGap > baselineOverfitGap + this.overfitGapThreshold) {
|
|
9700
|
+
return {
|
|
9701
|
+
promote: false,
|
|
9702
|
+
candidateId,
|
|
9703
|
+
baselineId,
|
|
9704
|
+
evidence,
|
|
9705
|
+
reason: `overfit_gap: candidate gap=${fmt(overfitGap)} exceeds baseline gap=${fmt(baselineOverfitGap)} by more than ${fmt(this.overfitGapThreshold)}`,
|
|
9706
|
+
rejectionCode: "overfit_gap"
|
|
9707
|
+
};
|
|
9708
|
+
}
|
|
9709
|
+
return {
|
|
9710
|
+
promote: true,
|
|
9711
|
+
candidateId,
|
|
9712
|
+
baselineId,
|
|
9713
|
+
evidence,
|
|
9714
|
+
reason: `promote: paired holdout median \u0394=${fmt(ci.median)} CI=[${fmt(ci.low)}, ${fmt(ci.high)}] over ${productiveRuns} pairs; overfit gap candidate=${fmt(overfitGap)} vs baseline=${fmt(baselineOverfitGap)}`,
|
|
9715
|
+
rejectionCode: null
|
|
9716
|
+
};
|
|
10153
9717
|
}
|
|
10154
|
-
|
|
10155
|
-
|
|
10156
|
-
|
|
10157
|
-
|
|
9718
|
+
};
|
|
9719
|
+
function inferCandidateId(candidate, baselineKey) {
|
|
9720
|
+
for (const run of candidate) {
|
|
9721
|
+
if (run.candidateId && run.candidateId !== baselineKey) return run.candidateId;
|
|
10158
9722
|
}
|
|
10159
|
-
|
|
10160
|
-
|
|
9723
|
+
return candidate[0]?.candidateId ?? "(unknown candidate)";
|
|
9724
|
+
}
|
|
9725
|
+
function indexHoldoutByKey(runs) {
|
|
9726
|
+
const out = /* @__PURE__ */ new Map();
|
|
9727
|
+
for (const r of runs) {
|
|
9728
|
+
if (r.splitTag !== "holdout") continue;
|
|
9729
|
+
if (r.outcome.holdoutScore === void 0) continue;
|
|
9730
|
+
out.set(pairKey(r), r.outcome.holdoutScore);
|
|
10161
9731
|
}
|
|
10162
|
-
|
|
10163
|
-
|
|
10164
|
-
|
|
10165
|
-
|
|
10166
|
-
|
|
10167
|
-
|
|
10168
|
-
|
|
10169
|
-
|
|
10170
|
-
|
|
10171
|
-
|
|
9732
|
+
return out;
|
|
9733
|
+
}
|
|
9734
|
+
function pairKey(r) {
|
|
9735
|
+
return `${r.experimentId}::${r.seed}`;
|
|
9736
|
+
}
|
|
9737
|
+
function scores(runs, field, splitFilter) {
|
|
9738
|
+
const out = [];
|
|
9739
|
+
for (const r of runs) {
|
|
9740
|
+
if (r.splitTag !== splitFilter) continue;
|
|
9741
|
+
const v = r.outcome[field];
|
|
9742
|
+
if (typeof v === "number" && Number.isFinite(v)) out.push(v);
|
|
10172
9743
|
}
|
|
10173
|
-
|
|
9744
|
+
return out;
|
|
9745
|
+
}
|
|
9746
|
+
function mean4(xs) {
|
|
9747
|
+
if (xs.length === 0) return Number.NaN;
|
|
9748
|
+
return xs.reduce((s, x) => s + x, 0) / xs.length;
|
|
9749
|
+
}
|
|
9750
|
+
function safeDiff(a, b) {
|
|
9751
|
+
if (!Number.isFinite(a) || !Number.isFinite(b)) return Number.NaN;
|
|
9752
|
+
return a - b;
|
|
9753
|
+
}
|
|
9754
|
+
function medianDelta(before, after) {
|
|
9755
|
+
const ds = before.map((b, i) => after[i] - b).sort((x, y) => x - y);
|
|
9756
|
+
if (ds.length === 0) return 0;
|
|
9757
|
+
const mid = Math.floor(ds.length / 2);
|
|
9758
|
+
return ds.length % 2 === 0 ? (ds[mid - 1] + ds[mid]) / 2 : ds[mid];
|
|
9759
|
+
}
|
|
9760
|
+
function fmt(x) {
|
|
9761
|
+
if (!Number.isFinite(x)) return String(x);
|
|
9762
|
+
return x.toFixed(4);
|
|
9763
|
+
}
|
|
10174
9764
|
|
|
10175
9765
|
// src/judge-retry.ts
|
|
10176
9766
|
var DEFAULT_MAX_ATTEMPTS = 3;
|
|
@@ -10250,9 +9840,9 @@ function passOrthogonality(input) {
|
|
|
10250
9840
|
sims.push(cosineSimilarity(vectors[i], vectors[j]));
|
|
10251
9841
|
}
|
|
10252
9842
|
}
|
|
10253
|
-
const
|
|
9843
|
+
const mean5 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
|
|
10254
9844
|
return {
|
|
10255
|
-
orthogonality: Math.max(0, Math.min(1, 1 -
|
|
9845
|
+
orthogonality: Math.max(0, Math.min(1, 1 - mean5)),
|
|
10256
9846
|
passCount: passes.length,
|
|
10257
9847
|
similarities: sims
|
|
10258
9848
|
};
|
|
@@ -10351,6 +9941,44 @@ function referenceReplayScenarioToRunScore(scenarioScore, durationMs = 0) {
|
|
|
10351
9941
|
};
|
|
10352
9942
|
}
|
|
10353
9943
|
|
|
9944
|
+
// src/researcher.ts
|
|
9945
|
+
var CallbackResearcher = class {
|
|
9946
|
+
constructor(callbacks) {
|
|
9947
|
+
this.callbacks = callbacks;
|
|
9948
|
+
}
|
|
9949
|
+
callbacks;
|
|
9950
|
+
inspectFailures(runs) {
|
|
9951
|
+
return this.callbacks.inspectFailures(runs);
|
|
9952
|
+
}
|
|
9953
|
+
proposeChange(failures) {
|
|
9954
|
+
return this.callbacks.proposeChange(failures);
|
|
9955
|
+
}
|
|
9956
|
+
applyChange(changes, baseline) {
|
|
9957
|
+
return this.callbacks.applyChange(changes, baseline);
|
|
9958
|
+
}
|
|
9959
|
+
evaluateChange(plan) {
|
|
9960
|
+
return this.callbacks.evaluateChange(plan);
|
|
9961
|
+
}
|
|
9962
|
+
};
|
|
9963
|
+
var NoopResearcher = class {
|
|
9964
|
+
hint;
|
|
9965
|
+
constructor(hint = "NoopResearcher: no implementation wired") {
|
|
9966
|
+
this.hint = hint;
|
|
9967
|
+
}
|
|
9968
|
+
async inspectFailures(_runs) {
|
|
9969
|
+
throw new Error(`${this.hint} (inspectFailures not implemented)`);
|
|
9970
|
+
}
|
|
9971
|
+
async proposeChange(_failures) {
|
|
9972
|
+
throw new Error(`${this.hint} (proposeChange not implemented)`);
|
|
9973
|
+
}
|
|
9974
|
+
async applyChange(_changes, _baseline) {
|
|
9975
|
+
throw new Error(`${this.hint} (applyChange not implemented)`);
|
|
9976
|
+
}
|
|
9977
|
+
async evaluateChange(_plan) {
|
|
9978
|
+
throw new Error(`${this.hint} (evaluateChange not implemented)`);
|
|
9979
|
+
}
|
|
9980
|
+
};
|
|
9981
|
+
|
|
10354
9982
|
// src/sandbox-pool.ts
|
|
10355
9983
|
function createSandboxPool(opts) {
|
|
10356
9984
|
if (opts.size < 1) throw new Error(`sandbox pool size must be >= 1 (got ${opts.size})`);
|
|
@@ -10453,55 +10081,6 @@ function createSandboxPool(opts) {
|
|
|
10453
10081
|
};
|
|
10454
10082
|
}
|
|
10455
10083
|
|
|
10456
|
-
// src/trial-aggregator.ts
|
|
10457
|
-
function meanOf(xs) {
|
|
10458
|
-
if (xs.length === 0) return 0;
|
|
10459
|
-
return xs.reduce((a, b) => a + b, 0) / xs.length;
|
|
10460
|
-
}
|
|
10461
|
-
function meanMetrics(rows) {
|
|
10462
|
-
if (rows.length === 0) return {};
|
|
10463
|
-
const keys = /* @__PURE__ */ new Set();
|
|
10464
|
-
for (const row of rows) for (const k of Object.keys(row)) keys.add(k);
|
|
10465
|
-
const out = {};
|
|
10466
|
-
for (const k of keys) {
|
|
10467
|
-
const xs = rows.map((r) => r[k]).filter((x) => typeof x === "number");
|
|
10468
|
-
if (xs.length > 0) out[k] = meanOf(xs);
|
|
10469
|
-
}
|
|
10470
|
-
return out;
|
|
10471
|
-
}
|
|
10472
|
-
function aggregateTrialsByMode(trials, opts) {
|
|
10473
|
-
const gradedTrials = trials.filter((t) => !t.error);
|
|
10474
|
-
const judgeOk = gradedTrials.filter((t) => t.judgeSucceeded !== false);
|
|
10475
|
-
const judgeFailed = gradedTrials.filter((t) => t.judgeSucceeded === false);
|
|
10476
|
-
if (opts.mode === "strict-fail" && judgeFailed.length > 0) {
|
|
10477
|
-
return {
|
|
10478
|
-
meanScore: 0,
|
|
10479
|
-
meanCost: 0,
|
|
10480
|
-
meanDurationMs: 0,
|
|
10481
|
-
okRate: 0,
|
|
10482
|
-
countedTrials: 0,
|
|
10483
|
-
excludedFailedTrials: judgeFailed.length,
|
|
10484
|
-
totalTrials: trials.length,
|
|
10485
|
-
metrics: {},
|
|
10486
|
-
strictFailure: {
|
|
10487
|
-
failedCount: judgeFailed.length,
|
|
10488
|
-
firstError: judgeFailed.find((t) => t.judgeError)?.judgeError
|
|
10489
|
-
}
|
|
10490
|
-
};
|
|
10491
|
-
}
|
|
10492
|
-
const counted = opts.mode === "exclude-failed" ? judgeOk : gradedTrials;
|
|
10493
|
-
return {
|
|
10494
|
-
meanScore: meanOf(counted.map((t) => t.score)),
|
|
10495
|
-
meanCost: meanOf(counted.map((t) => t.cost ?? 0)),
|
|
10496
|
-
meanDurationMs: meanOf(counted.map((t) => t.durationMs ?? 0)),
|
|
10497
|
-
okRate: gradedTrials.length === 0 ? 0 : gradedTrials.filter((t) => t.ok).length / gradedTrials.length,
|
|
10498
|
-
countedTrials: counted.length,
|
|
10499
|
-
excludedFailedTrials: judgeFailed.length,
|
|
10500
|
-
totalTrials: trials.length,
|
|
10501
|
-
metrics: meanMetrics(counted.map((t) => t.metrics ?? {}))
|
|
10502
|
-
};
|
|
10503
|
-
}
|
|
10504
|
-
|
|
10505
10084
|
// src/otel-pipeline.ts
|
|
10506
10085
|
function withOtelPipeline(opts) {
|
|
10507
10086
|
const config = {
|
|
@@ -10594,17 +10173,17 @@ function traceJudge(judge, judgeName, opts) {
|
|
|
10594
10173
|
}
|
|
10595
10174
|
});
|
|
10596
10175
|
try {
|
|
10597
|
-
const
|
|
10598
|
-
const composite =
|
|
10176
|
+
const scores2 = await judge(tc, input);
|
|
10177
|
+
const composite = scores2.length > 0 ? scores2.reduce((sum3, s) => sum3 + s.score, 0) / scores2.length : 0;
|
|
10599
10178
|
await span.end({
|
|
10600
10179
|
attributes: {
|
|
10601
10180
|
"judge.name": judgeName,
|
|
10602
10181
|
"judge.composite_score": composite,
|
|
10603
|
-
"judge.dimension_count":
|
|
10182
|
+
"judge.dimension_count": scores2.length,
|
|
10604
10183
|
"eval.phase": "judge"
|
|
10605
10184
|
}
|
|
10606
10185
|
});
|
|
10607
|
-
return
|
|
10186
|
+
return scores2;
|
|
10608
10187
|
} catch (err) {
|
|
10609
10188
|
await span.fail(err instanceof Error ? err : String(err));
|
|
10610
10189
|
throw err;
|
|
@@ -10631,8 +10210,8 @@ function traceJudgeEnsemble(judges, judgeNames, opts) {
|
|
|
10631
10210
|
emitter: opts.emitter,
|
|
10632
10211
|
parentSpanId: ensembleSpan.span.spanId
|
|
10633
10212
|
});
|
|
10634
|
-
const
|
|
10635
|
-
allScores.push(...
|
|
10213
|
+
const scores2 = await tracedFn(tc, input);
|
|
10214
|
+
allScores.push(...scores2);
|
|
10636
10215
|
}
|
|
10637
10216
|
const composite = allScores.length > 0 ? allScores.reduce((sum3, s) => sum3 + s.score, 0) / allScores.length : 0;
|
|
10638
10217
|
await ensembleSpan.end({
|
|
@@ -10650,48 +10229,6 @@ function traceJudgeEnsemble(judges, judgeNames, opts) {
|
|
|
10650
10229
|
}
|
|
10651
10230
|
};
|
|
10652
10231
|
}
|
|
10653
|
-
|
|
10654
|
-
// src/traced-mutator.ts
|
|
10655
|
-
function traceMutator(adapter, opts) {
|
|
10656
|
-
return {
|
|
10657
|
-
async mutate(args) {
|
|
10658
|
-
const span = await opts.emitter.span({
|
|
10659
|
-
kind: "llm",
|
|
10660
|
-
name: `mutator:gen-${args.generation}`,
|
|
10661
|
-
parentSpanId: opts.parentSpanId,
|
|
10662
|
-
attributes: {
|
|
10663
|
-
"mutator.parent_id": args.parent.id,
|
|
10664
|
-
"mutator.generation": args.generation,
|
|
10665
|
-
"mutator.child_count": args.childCount,
|
|
10666
|
-
"mutator.top_trials": args.topTrials.length,
|
|
10667
|
-
"mutator.bottom_trials": args.bottomTrials.length,
|
|
10668
|
-
"mutator.parent_score": args.parentAggregate.meanScore,
|
|
10669
|
-
"eval.phase": "mutator"
|
|
10670
|
-
}
|
|
10671
|
-
});
|
|
10672
|
-
try {
|
|
10673
|
-
const children = await adapter.mutate(args);
|
|
10674
|
-
await span.end({
|
|
10675
|
-
attributes: {
|
|
10676
|
-
"mutator.parent_id": args.parent.id,
|
|
10677
|
-
"mutator.generation": args.generation,
|
|
10678
|
-
"mutator.child_count": args.childCount,
|
|
10679
|
-
"mutator.top_trials": args.topTrials.length,
|
|
10680
|
-
"mutator.bottom_trials": args.bottomTrials.length,
|
|
10681
|
-
"mutator.parent_score": args.parentAggregate.meanScore,
|
|
10682
|
-
"mutator.produced_count": children.length,
|
|
10683
|
-
"mutator.child_ids": children.map((c) => c.id).join(","),
|
|
10684
|
-
"eval.phase": "mutator"
|
|
10685
|
-
}
|
|
10686
|
-
});
|
|
10687
|
-
return children;
|
|
10688
|
-
} catch (err) {
|
|
10689
|
-
await span.fail(err instanceof Error ? err : String(err));
|
|
10690
|
-
throw err;
|
|
10691
|
-
}
|
|
10692
|
-
}
|
|
10693
|
-
};
|
|
10694
|
-
}
|
|
10695
10232
|
export {
|
|
10696
10233
|
AGENT_PROFILE_KINDS,
|
|
10697
10234
|
ANALYST_SEVERITIES,
|
|
@@ -10709,7 +10246,6 @@ export {
|
|
|
10709
10246
|
CaptureIntegrityError,
|
|
10710
10247
|
ConfigError,
|
|
10711
10248
|
ConvergenceTracker,
|
|
10712
|
-
CostLedger,
|
|
10713
10249
|
CostTracker,
|
|
10714
10250
|
D1ExperimentStore,
|
|
10715
10251
|
DEFAULT_AGENT_SLOS,
|
|
@@ -10750,15 +10286,12 @@ export {
|
|
|
10750
10286
|
InMemoryFeedbackTrajectoryStore,
|
|
10751
10287
|
InMemoryRawProviderSink,
|
|
10752
10288
|
InMemoryTraceStore,
|
|
10753
|
-
InMemoryTrialCache,
|
|
10754
10289
|
InMemoryWorkspaceInspector,
|
|
10755
|
-
JsonlTrialCache,
|
|
10756
10290
|
JudgeError,
|
|
10757
10291
|
JudgeRunner,
|
|
10758
10292
|
KIND_EXPECTED_SUBJECTS,
|
|
10759
10293
|
KNOWLEDGE_GAP_KIND_SPEC,
|
|
10760
10294
|
KNOWLEDGE_POISONING_KIND_SPEC,
|
|
10761
|
-
LineageRecorder,
|
|
10762
10295
|
LlmCallError,
|
|
10763
10296
|
LlmClient,
|
|
10764
10297
|
LlmRouteAssertionError,
|
|
@@ -10766,7 +10299,6 @@ export {
|
|
|
10766
10299
|
MODEL_PRICING,
|
|
10767
10300
|
MetricsCollector,
|
|
10768
10301
|
MultiLayerVerifier,
|
|
10769
|
-
MutationTelemetry,
|
|
10770
10302
|
Mutex,
|
|
10771
10303
|
NoopRawProviderSink,
|
|
10772
10304
|
NoopResearcher,
|
|
@@ -10800,7 +10332,6 @@ export {
|
|
|
10800
10332
|
TraceEmitter,
|
|
10801
10333
|
TraceFileMissingError,
|
|
10802
10334
|
TraceNotFoundError,
|
|
10803
|
-
TrialTelemetry,
|
|
10804
10335
|
UNIVERSAL_FINDERS,
|
|
10805
10336
|
ValidationError,
|
|
10806
10337
|
VerificationError,
|
|
@@ -10812,7 +10343,6 @@ export {
|
|
|
10812
10343
|
aggregateLlm,
|
|
10813
10344
|
aggregatePrReviewScore,
|
|
10814
10345
|
aggregateRunScore,
|
|
10815
|
-
aggregateTrialsByMode,
|
|
10816
10346
|
allCriticalPassed,
|
|
10817
10347
|
analyzeAntiSlop,
|
|
10818
10348
|
analyzeSeries,
|
|
@@ -10881,7 +10411,6 @@ export {
|
|
|
10881
10411
|
corpusInterRaterAgreementFromJudgeScores,
|
|
10882
10412
|
createAntiSlopJudge,
|
|
10883
10413
|
createChatClient,
|
|
10884
|
-
createCompositeMutator,
|
|
10885
10414
|
createCustomJudge,
|
|
10886
10415
|
createDefaultReviewer,
|
|
10887
10416
|
createDomainExpertJudge,
|
|
@@ -10894,7 +10423,6 @@ export {
|
|
|
10894
10423
|
createOtelTracingStore,
|
|
10895
10424
|
createReplayFetch,
|
|
10896
10425
|
createRunCriticAdapter,
|
|
10897
|
-
createSandboxCodeMutator,
|
|
10898
10426
|
createSandboxPool,
|
|
10899
10427
|
createSemanticConceptJudge,
|
|
10900
10428
|
createSemanticConceptJudgeAdapter,
|
|
@@ -10908,7 +10436,6 @@ export {
|
|
|
10908
10436
|
decideReferenceReplayRunPromotion,
|
|
10909
10437
|
defaultIsMaterial,
|
|
10910
10438
|
defaultJudges,
|
|
10911
|
-
defaultMultiShotObjectives,
|
|
10912
10439
|
defaultProviderRedactor,
|
|
10913
10440
|
defaultReferenceReplayMatcher,
|
|
10914
10441
|
defaultTraceInsightPanel,
|
|
@@ -10966,11 +10493,6 @@ export {
|
|
|
10966
10493
|
inMemoryReferenceReplayStore,
|
|
10967
10494
|
inMemoryReviewStore,
|
|
10968
10495
|
inferDomainKeywords,
|
|
10969
|
-
integrationAsi,
|
|
10970
|
-
integrationGateEvals,
|
|
10971
|
-
integrationInvokeFailedPayload,
|
|
10972
|
-
integrationManifestResolvedPayload,
|
|
10973
|
-
integrationManifestValidatedPayload,
|
|
10974
10496
|
interRaterReliability,
|
|
10975
10497
|
iqr,
|
|
10976
10498
|
isJudgeSpan,
|
|
@@ -11048,7 +10570,6 @@ export {
|
|
|
11048
10570
|
referenceReplayScenarioToRunScore,
|
|
11049
10571
|
regexMatch,
|
|
11050
10572
|
regexMatches,
|
|
11051
|
-
releaseTraceEvidenceFromMultiShotTrials,
|
|
11052
10573
|
renderFindingSubject,
|
|
11053
10574
|
renderMarkdown,
|
|
11054
10575
|
renderMarkdownReport,
|
|
@@ -11083,9 +10604,6 @@ export {
|
|
|
11083
10604
|
runKeywordCoverageJudge,
|
|
11084
10605
|
runKeywordCoverageJudgeUrl,
|
|
11085
10606
|
runLiveProof,
|
|
11086
|
-
runMultiShotOptimization,
|
|
11087
|
-
runProductionLoop,
|
|
11088
|
-
runPromptEvolution,
|
|
11089
10607
|
runProposeReview,
|
|
11090
10608
|
runProposeReviewAsControlLoop,
|
|
11091
10609
|
runReferenceReplay,
|
|
@@ -11134,9 +10652,7 @@ export {
|
|
|
11134
10652
|
traceAnalystOnRunComplete,
|
|
11135
10653
|
traceJudge,
|
|
11136
10654
|
traceJudgeEnsemble,
|
|
11137
|
-
traceMutator,
|
|
11138
10655
|
tracedAnalyzeTraces,
|
|
11139
|
-
trialTraceFromMultiShotTrial,
|
|
11140
10656
|
typoMutator,
|
|
11141
10657
|
urlContains,
|
|
11142
10658
|
userQuestionsForKnowledgeGaps,
|