@tangle-network/agent-eval 0.24.0 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +65 -0
- package/README.md +71 -0
- package/dist/{chunk-SY6WAAAD.js → chunk-5LBB5B3Z.js} +296 -5
- package/dist/chunk-5LBB5B3Z.js.map +1 -0
- package/dist/{chunk-VRJVTXRV.js → chunk-EDUKQ5AM.js} +85 -85
- package/dist/{chunk-VRJVTXRV.js.map → chunk-EDUKQ5AM.js.map} +1 -1
- package/dist/{chunk-OHEPNJQN.js → chunk-JLZQWFV3.js} +65 -1
- package/dist/chunk-JLZQWFV3.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/index.d.ts +311 -11
- package/dist/index.js +695 -2
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +491 -1
- package/dist/optimization.d.ts +2 -2
- package/dist/optimization.js +1 -1
- package/dist/pipelines/index.js +3 -67
- package/dist/pipelines/index.js.map +1 -1
- package/dist/{release-report-TDPn1cxq.d.ts → release-report-BNgMdqPF.d.ts} +1 -1
- package/dist/reporting.d.ts +2 -2
- package/dist/{researcher-CUOiGcGv.d.ts → researcher-BPT8x_NT.d.ts} +1 -1
- package/dist/rl.d.ts +3 -3
- package/dist/{summary-report-BXGs_9V0.d.ts → summary-report-C7VPYEj2.d.ts} +1 -1
- package/dist/wire/index.d.ts +347 -3
- package/dist/wire/index.js +19 -1
- package/package.json +1 -1
- package/dist/chunk-OHEPNJQN.js.map +0 -1
- package/dist/chunk-SY6WAAAD.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -8,9 +8,10 @@ import {
|
|
|
8
8
|
classifyFailure,
|
|
9
9
|
compareToBaseline,
|
|
10
10
|
computeToolUseMetrics,
|
|
11
|
+
failureClusterView,
|
|
11
12
|
iqr,
|
|
12
13
|
welchsTTest
|
|
13
|
-
} from "./chunk-
|
|
14
|
+
} from "./chunk-JLZQWFV3.js";
|
|
14
15
|
import {
|
|
15
16
|
exportTrainingData,
|
|
16
17
|
toNdjson
|
|
@@ -95,7 +96,7 @@ import {
|
|
|
95
96
|
summarizePreferenceMemory,
|
|
96
97
|
trialTraceFromMultiShotTrial,
|
|
97
98
|
withAssignedFeedbackSplit
|
|
98
|
-
} from "./chunk-
|
|
99
|
+
} from "./chunk-EDUKQ5AM.js";
|
|
99
100
|
import {
|
|
100
101
|
RunRecordValidationError,
|
|
101
102
|
isRunRecord,
|
|
@@ -220,6 +221,304 @@ import {
|
|
|
220
221
|
} from "./chunk-NG236HPC.js";
|
|
221
222
|
import "./chunk-PZ5AY32C.js";
|
|
222
223
|
|
|
224
|
+
// src/auto-pr.ts
|
|
225
|
+
/**
 * Validate a pull-request proposal and hand it to the supplied client.
 *
 * @param client - Object exposing an async `proposeChange(input)` method.
 * @param input - Proposal; checked by `validate` and then passed through unchanged.
 * @returns Whatever `client.proposeChange(input)` resolves to.
 * @throws Rejects with the error raised by `validate` when the input is invalid;
 *   the client is not called in that case.
 */
async function proposeAutomatedPullRequest(client, input) {
  validate(input);
  const outcome = await client.proposeChange(input);
  return outcome;
}
|
|
229
|
+
/**
 * Check a pull-request proposal before it is sent to a client.
 *
 * Checks run in this order and the first failure wins: repo owner/name,
 * branch name (non-empty, no whitespace), branch differs from the base
 * branch (base defaults to "main"), at least one file change, every file
 * path is relative without "..", no duplicate paths, non-empty title.
 *
 * @param input - Proposal with `repo`, `branchName`, optional `baseBranch`,
 *   `fileChanges` and `title`.
 * @throws {ValidationError} When any check fails.
 */
function validate(input) {
  const { repo, branchName, fileChanges } = input;

  if (repo.owner.trim() === "" || repo.name.trim() === "") {
    throw new ValidationError("proposeAutomatedPullRequest: repo.owner and repo.name required");
  }

  const branchIsBlank = branchName.trim() === "";
  if (branchIsBlank || /\s/.test(branchName)) {
    throw new ValidationError(
      "proposeAutomatedPullRequest: branchName must be non-empty and contain no whitespace"
    );
  }

  const base = input.baseBranch ?? "main";
  if (base === branchName) {
    throw new ValidationError("proposeAutomatedPullRequest: branchName must differ from baseBranch");
  }

  if (fileChanges.length === 0) {
    throw new ValidationError("proposeAutomatedPullRequest: fileChanges must not be empty");
  }

  // Paths are checked one by one so the first offending entry is the one reported.
  const visited = /* @__PURE__ */ new Set();
  for (const { path } of fileChanges) {
    const unsafe = path.trim() === "" || path.includes("..") || path.startsWith("/");
    if (unsafe) {
      throw new ValidationError(
        `proposeAutomatedPullRequest: invalid file path "${path}" (no '..' or leading '/')`
      );
    }
    if (visited.has(path)) {
      throw new ValidationError(`proposeAutomatedPullRequest: duplicate file path "${path}"`);
    }
    visited.add(path);
  }

  if (input.title.trim() === "") {
    throw new ValidationError("proposeAutomatedPullRequest: title must not be empty");
  }
}
|
|
260
|
+
/**
 * Build a pull-request client that talks to the GitHub REST API over HTTP.
 *
 * @param opts - `token` (sent as a Bearer token), optional `fetchImpl`
 *   (defaults to the global `fetch`), optional `apiBase` (defaults to
 *   "https://api.github.com"; trailing slashes are stripped) and optional
 *   `now` clock used to timestamp the commit author.
 * @returns An object with an async `proposeChange(input)` method.
 */
function httpGithubClient(opts) {
  const fetchImpl = opts.fetchImpl ?? fetch;
  const apiBase = (opts.apiBase ?? "https://api.github.com").replace(/\/+$/, "");
  const now = opts.now ?? (() => /* @__PURE__ */ new Date());

  /**
   * Issue one JSON request against the API.
   * Returns the parsed JSON body, or `null` when `accept404` is set and the
   * response status is 404. Any other non-OK response throws a ConfigError
   * carrying the status and up to 400 characters of the response text.
   */
  async function api(method, path, body, accept404 = false) {
    const res = await fetchImpl(`${apiBase}${path}`, {
      method,
      headers: {
        accept: "application/vnd.github+json",
        "content-type": "application/json",
        authorization: `Bearer ${opts.token}`,
        "x-github-api-version": "2022-11-28"
      },
      body: body === void 0 ? void 0 : JSON.stringify(body)
    });
    if (accept404 && res.status === 404) return null;
    if (!res.ok) {
      const text = await res.text().catch(() => "");
      throw new ConfigError(
        `proposeAutomatedPullRequest: GitHub ${method} ${path} \u2192 ${res.status} ${text.slice(0, 400)}`
      );
    }
    return await res.json();
  }

  return {
    /**
     * Create blobs, a tree and a commit on top of the base branch, point the
     * target branch at that commit (creating it, or force-updating it when it
     * already exists), then reuse the first open PR for the branch or create
     * a new one. Reviewers and labels are requested best-effort.
     *
     * With `input.dryRun` set, no request is made and a compare URL is returned.
     *
     * @returns `{ prUrl, branchName, headSha, dryRun }`.
     * @throws {ConfigError} On any failed API call, or when the base branch or
     *   base commit cannot be found.
     */
    async proposeChange(input) {
      const baseBranch = input.baseBranch ?? "main";
      const repoPath = `/repos/${input.repo.owner}/${input.repo.name}`;
      if (input.dryRun) {
        return {
          prUrl: `https://github.com/${input.repo.owner}/${input.repo.name}/compare/${baseBranch}...${input.branchName}`,
          branchName: input.branchName,
          headSha: "dry-run",
          dryRun: true
        };
      }
      // accept404 so a missing base branch reaches the dedicated message below
      // instead of the generic "GitHub GET ... 404" error thrown inside api().
      const baseRef = await api("GET", `${repoPath}/git/ref/heads/${baseBranch}`, void 0, true);
      if (!baseRef) {
        throw new ConfigError(`proposeAutomatedPullRequest: base branch "${baseBranch}" not found`);
      }
      const baseSha = baseRef.object.sha;
      // Same reasoning: let a 404 on the commit surface the specific message.
      const baseCommit = await api("GET", `${repoPath}/git/commits/${baseSha}`, void 0, true);
      if (!baseCommit) {
        throw new ConfigError(
          `proposeAutomatedPullRequest: base commit ${baseSha} not found (race condition?)`
        );
      }
      // One blob per changed file; every entry is written as a regular file (mode 100644).
      const treeEntries = [];
      for (const change of input.fileChanges) {
        const blob = await api("POST", `${repoPath}/git/blobs`, {
          content: change.contents,
          encoding: "utf-8"
        });
        if (!blob) throw new ConfigError("proposeAutomatedPullRequest: blob creation returned null");
        treeEntries.push({
          path: change.path,
          mode: "100644",
          type: "blob",
          sha: blob.sha
        });
      }
      const tree = await api("POST", `${repoPath}/git/trees`, {
        base_tree: baseCommit.tree.sha,
        tree: treeEntries
      });
      if (!tree) throw new ConfigError("proposeAutomatedPullRequest: tree creation returned null");
      // An explicit author/committer is sent only when both name and email are given.
      const author = input.authorName && input.authorEmail ? { name: input.authorName, email: input.authorEmail, date: now().toISOString() } : void 0;
      const commitMessage = renderCommitMessage(input);
      const commit = await api("POST", `${repoPath}/git/commits`, {
        message: commitMessage,
        tree: tree.sha,
        parents: [baseSha],
        ...author ? { author, committer: author } : {}
      });
      if (!commit)
        throw new ConfigError("proposeAutomatedPullRequest: commit creation returned null");
      // A 404 here simply means the branch does not exist yet.
      const existing = await api(
        "GET",
        `${repoPath}/git/ref/heads/${input.branchName}`,
        void 0,
        true
      );
      if (!existing) {
        await api("POST", `${repoPath}/git/refs`, {
          ref: `refs/heads/${input.branchName}`,
          sha: commit.sha
        });
      } else if (existing.object.sha !== commit.sha) {
        // The branch is rebuilt from the base on every call, so the update is forced.
        await api("PATCH", `${repoPath}/git/refs/heads/${input.branchName}`, {
          sha: commit.sha,
          force: true
        });
      }
      const openPrs = await api(
        "GET",
        `${repoPath}/pulls?state=open&head=${encodeURIComponent(`${input.repo.owner}:${input.branchName}`)}`
      );
      let pr;
      if (openPrs && openPrs.length > 0) {
        pr = openPrs[0];
      } else {
        const created = await api("POST", `${repoPath}/pulls`, {
          title: input.title,
          body: input.body,
          head: input.branchName,
          base: baseBranch
        });
        if (!created)
          throw new ConfigError("proposeAutomatedPullRequest: PR creation returned null");
        pr = created;
      }
      // Reviewer and label requests are best-effort: a failure here must not
      // fail a proposal whose branch and PR already exist.
      if (input.reviewers && input.reviewers.length > 0) {
        await api(
          "POST",
          `${repoPath}/pulls/${pr.number}/requested_reviewers`,
          { reviewers: input.reviewers },
          true
        ).catch(() => {
        });
      }
      if (input.labels && input.labels.length > 0) {
        await api(
          "POST",
          `${repoPath}/issues/${pr.number}/labels`,
          { labels: input.labels },
          true
        ).catch(() => {
        });
      }
      return {
        prUrl: pr.html_url,
        branchName: input.branchName,
        headSha: commit.sha,
        dryRun: false
      };
    }
  };
}
|
|
398
|
+
/**
 * Build a pull-request client that drives local `git` and the GitHub CLI.
 *
 * @param opts - Optional `bin` (GitHub CLI executable, default "gh"), `cwd`
 *   (working directory, default `process.cwd()`) and `exec` (command runner
 *   called as `exec(cmd, args, { cwd, stdin })` and resolving to
 *   `{ stdout, stderr, exitCode }`; defaults to `defaultExec`).
 * @returns An object with an async `proposeChange(input)` method.
 */
function ghCliClient(opts = {}) {
  const bin = opts.bin ?? "gh";
  const cwd = opts.cwd ?? process.cwd();
  const exec = opts.exec ?? defaultExec;

  /** Run a command and throw a ConfigError when it exits non-zero. */
  async function run(cmd, args, stdin) {
    const r = await exec(cmd, args, { cwd, stdin });
    if (r.exitCode !== 0) {
      throw new ConfigError(
        `proposeAutomatedPullRequest: ${cmd} ${args.join(" ")} failed (${r.exitCode}): ${r.stderr.trim() || r.stdout.trim()}`
      );
    }
    return r;
  }

  return {
    /**
     * Reset the working tree to `origin/<base>`, recreate the target branch,
     * write and stage the file changes, commit, force-push, then reuse an
     * open PR for the branch or create one with the GitHub CLI.
     *
     * With `input.dryRun` set, no command is run and a compare URL is returned.
     *
     * @returns `{ prUrl, branchName, headSha, dryRun }`.
     * @throws {ConfigError} When a required git/gh command exits non-zero.
     */
    async proposeChange(input) {
      const baseBranch = input.baseBranch ?? "main";
      if (input.dryRun) {
        return {
          prUrl: `https://github.com/${input.repo.owner}/${input.repo.name}/compare/${baseBranch}...${input.branchName}`,
          branchName: input.branchName,
          headSha: "dry-run",
          dryRun: true
        };
      }
      await run("git", ["fetch", "origin", baseBranch]);
      await run("git", ["checkout", baseBranch]);
      await run("git", ["reset", "--hard", `origin/${baseBranch}`]);
      // Deleting a stale local branch may fail when none exists; that is fine,
      // so this call bypasses run() and its exit-code check.
      await exec("git", ["branch", "-D", input.branchName], { cwd });
      await run("git", ["checkout", "-b", input.branchName]);
      const { mkdir, writeFile } = await import("fs/promises");
      const { dirname: dirname5, join: join3, resolve } = await import("path");
      for (const change of input.fileChanges) {
        const abs = resolve(cwd, change.path);
        await mkdir(dirname5(abs), { recursive: true });
        await writeFile(abs, change.contents, "utf8");
        await run("git", ["add", join3(change.path)]);
      }
      // Apply the requested identity through per-invocation git config.
      // Previously GIT_AUTHOR_*/GIT_COMMITTER_* values were collected into an
      // object that was never handed to the command runner, so authorName and
      // authorEmail had no effect on the commit.
      const identityArgs = [];
      if (input.authorName) identityArgs.push("-c", `user.name=${input.authorName}`);
      if (input.authorEmail) identityArgs.push("-c", `user.email=${input.authorEmail}`);
      const message = renderCommitMessage(input);
      await run("git", [...identityArgs, "commit", "-m", message]);
      const headRes = await run("git", ["rev-parse", "HEAD"]);
      const headSha = headRes.stdout.trim();
      await run("git", ["push", "-f", "origin", input.branchName]);
      // Look for an already-open PR; a failing lookup falls through to creation.
      const existing = await exec(
        bin,
        [
          "pr",
          "list",
          "--state",
          "open",
          "--head",
          input.branchName,
          "--json",
          "url,number",
          "--limit",
          "1"
        ],
        { cwd }
      );
      let prUrl = "";
      if (existing.exitCode === 0 && existing.stdout.trim()) {
        const parsed = JSON.parse(existing.stdout);
        if (parsed.length > 0 && parsed[0]) prUrl = parsed[0].url;
      }
      if (!prUrl) {
        const args = [
          "pr",
          "create",
          "--title",
          input.title,
          "--body",
          input.body,
          "--base",
          baseBranch
        ];
        if (input.reviewers && input.reviewers.length > 0) {
          args.push("--reviewer", input.reviewers.join(","));
        }
        if (input.labels && input.labels.length > 0) {
          args.push("--label", input.labels.join(","));
        }
        const r = await run(bin, args);
        // Take the first URL from the CLI output; fall back to the raw output.
        const match = r.stdout.match(/https?:\/\/\S+/);
        prUrl = match ? match[0] : r.stdout.trim();
      }
      return { prUrl, branchName: input.branchName, headSha, dryRun: false };
    }
  };
}
|
|
491
|
+
/**
 * Spawn a child process and collect its output.
 *
 * Never rejects: a spawn failure is reported as `exitCode: 1` with the error
 * message appended to `stderr`.
 *
 * @param bin - Executable to run.
 * @param args - Argument list passed to the executable.
 * @param opts - `cwd` for the child and optional `stdin` text to feed it.
 * @returns Promise resolving to `{ stdout, stderr, exitCode }`.
 */
async function defaultExec(bin, args, opts) {
  const { spawn } = await import("child_process");
  return new Promise((resolveExec) => {
    const child = spawn(bin, args, { cwd: opts.cwd });
    let stdout = "";
    let stderr = "";
    child.stdout.on("data", (d) => {
      stdout += d.toString();
    });
    child.stderr.on("data", (d) => {
      stderr += d.toString();
    });
    // A child that exits (or never starts) before consuming stdin makes the
    // pipe emit an error; the failure is already reported through the exit
    // code, so it must not surface as an unhandled stream error.
    child.stdin.on("error", () => {
    });
    // Always close stdin. Leaving the pipe open when there is nothing to send
    // makes any child that reads stdin wait forever for end-of-input.
    if (opts.stdin) {
      child.stdin.end(opts.stdin);
    } else {
      child.stdin.end();
    }
    child.on("error", (err) => {
      resolveExec({ stdout, stderr: `${stderr}${err.message}`, exitCode: 1 });
    });
    child.on("close", (code) => {
      // `code` is null when the child was terminated by a signal.
      resolveExec({ stdout, stderr, exitCode: code ?? 1 });
    });
  });
}
|
|
512
|
+
/**
 * Build the commit message for a proposal.
 *
 * Layout: title, blank line, one "- path: rationale" bullet per file change
 * that carries a rationale, blank line, then the trimmed body. The final
 * message is trimmed, so a missing or empty body leaves no trailing blanks.
 *
 * @param input - Proposal with `title`, `fileChanges` and optional `body`.
 * @returns The rendered commit message.
 */
function renderCommitMessage(input) {
  const lines = [input.title, ""];
  for (const change of input.fileChanges) {
    if (change.rationale) lines.push(`- ${change.path}: ${change.rationale}`);
  }
  // Separate the bullet list from the body with exactly one blank line.
  if (lines[lines.length - 1] !== "") lines.push("");
  // `body` is not validated upstream; treat a missing body as empty rather
  // than crashing on `.trim()`.
  lines.push((input.body ?? "").trim());
  return lines.join("\n").trim();
}
|
|
521
|
+
|
|
223
522
|
// src/executor.ts
|
|
224
523
|
async function executeScenario(tc, scenario, config) {
|
|
225
524
|
const startTime = Date.now();
|
|
@@ -1534,6 +1833,396 @@ function liveProofToReleaseTrace(config, trajectory, durationMs) {
|
|
|
1534
1833
|
};
|
|
1535
1834
|
}
|
|
1536
1835
|
|
|
1836
|
+
// src/production-loop.ts
|
|
1837
|
+
/**
 * Run one production-loop cycle.
 *
 * Steps, in order: validate options; read runs and feedback from the stores;
 * cluster failures; pick actionable clusters by size and severity; run
 * multi-shot optimization starting from the baseline prompt; evaluate release
 * confidence; and, when `opts.ship` is set and nothing blocked the change,
 * propose a pull request through `opts.ship.client`.
 *
 * @param opts - Loop options: `runId`, `target`, `traceStore`, `feedbackStore`,
 *   `cluster`, `evolve`, optional `releaseThresholds`, `ship`, `now`, `cron`.
 * @returns The object built by `finalize`, whose `decision` is one of
 *   "no_actionable_failures", "evolve_yielded_no_improvement", "gate_failed",
 *   "proposed_change" or "pr_opened".
 * @throws {ValidationError} From `validate2`, or when search and holdout
 *   scenario ids overlap.
 */
async function runProductionLoop(opts) {
  validate2(opts);
  const now = opts.now ?? (() => /* @__PURE__ */ new Date());
  const startedAt = now().toISOString();
  const observedRuns = await opts.traceStore.listRuns();
  const observedFeedback = await opts.feedbackStore.list();
  // NOTE(review): clustering falls back to a minimum size of 1 here while the
  // actionable filter below falls back to 5 — presumably so the report lists
  // every cluster while only large ones trigger work; confirm this is intended.
  const clusterReport = await failureClusterView(opts.traceStore, {
    minClusterSize: opts.cluster.minClusterSize ?? 1
  });
  const minSize = opts.cluster.minClusterSize ?? 5;
  const minSeverity = opts.cluster.minSeverityRatio ?? 0.05;
  const maxClusters = opts.cluster.maxClustersPerCycle ?? 1;
  const totalRuns = clusterReport.totalRuns;
  // Keep clusters that are big enough in absolute terms and as a share of all
  // runs (the share check is skipped when totalRuns is 0), capped at maxClusters.
  const actionable = clusterReport.clusters.filter((c) => c.runCount >= minSize).filter((c) => totalRuns === 0 || c.runCount / totalRuns >= minSeverity).slice(0, maxClusters);
  if (actionable.length === 0) {
    return finalize({
      opts,
      decision: "no_actionable_failures",
      startedAt,
      now,
      observedRunCount: observedRuns.length,
      observedFeedbackCount: observedFeedback.length,
      clusters: clusterReport.clusters,
      actedOnCluster: null,
      evolution: null,
      release: null,
      gate: null,
      promotedPrompt: opts.evolve.baselinePrompt,
      pullRequest: null
    });
  }
  // Only the first actionable cluster is acted on, whatever maxClusters allows.
  const actedOn = actionable[0];
  const baseline = {
    id: opts.evolve.baselineId ?? "baseline",
    label: opts.evolve.baselineId ?? "baseline",
    generation: 0,
    payload: opts.evolve.baselinePrompt
  };
  const holdoutIds = uniqueIds(opts.evolve.holdoutScenarios.map((s) => s.id));
  // When no search scenarios are given they are derived from the holdout set.
  const searchIds = uniqueIds(
    (opts.evolve.searchScenarios ?? deriveSearchScenarios(opts.evolve.holdoutScenarios)).map(
      (s) => s.id
    )
  );
  if (searchIds.some((id) => holdoutIds.includes(id))) {
    throw new ValidationError(
      "runProductionLoop: searchScenarios and holdoutScenarios must be disjoint"
    );
  }
  const reps = opts.evolve.reps ?? 3;
  const generations = opts.evolve.generations ?? 3;
  // NOTE(review): the populationSize fallback is computed from `reps` (with its
  // own fallback of 4, unlike the 3 used above) — confirm this is deliberate.
  const populationSize = opts.evolve.populationSize ?? Math.max(2, opts.evolve.reps ?? 4);
  const evolution = await runMultiShotOptimization({
    runId: `${opts.runId}/evolve`,
    target: opts.target,
    seedVariants: [baseline],
    searchScenarioIds: searchIds,
    reps,
    generations,
    populationSize,
    scoreConcurrency: opts.evolve.scoreConcurrency ?? 1,
    runner: opts.evolve.runner,
    scorer: opts.evolve.scorer,
    mutateAdapter: opts.evolve.mutator,
    gate: {
      holdoutScenarioIds: holdoutIds,
      reps,
      // The baseline key is always forced to this cycle's baseline id.
      gate: { ...opts.evolve.gate, baselineKey: baseline.id },
      // Without a caller-supplied converter, trials become synthetic run records.
      toRunRecord: opts.evolve.toRunRecord ?? (({ variant, scenarioId, rep, split, seed, trial }) => syntheticRunRecord({
        runId: `${opts.runId}-${variant.id}-${scenarioId}-${rep}-${split}`,
        variant,
        scenarioId,
        rep,
        split,
        seed,
        trial,
        target: opts.target
      }))
    }
  });
  const gate = evolution.gate?.decision ?? null;
  const promotedVariant = evolution.promotedVariant;
  const promoted = promotedVariant.payload;
  // A promoted variant with the baseline's id means nothing better was found.
  const promotedChanged = promotedVariant.id !== baseline.id;
  const allTrials = evolution.evolution.generations.flatMap(
    (g) => g.trials
  );
  const traceEvidence = releaseTraceEvidenceFromMultiShotTrials(allTrials);
  // NOTE(review): only explicitly supplied searchScenarios are listed here;
  // scenarios derived by deriveSearchScenarios above are not — confirm that the
  // release evaluation does not need them.
  const releaseScenarios = [
    ...(opts.evolve.searchScenarios ?? []).map((s) => ({
      id: s.id,
      payload: s,
      split: "train",
      tags: { persona: s.persona, label: s.label }
    })),
    ...opts.evolve.holdoutScenarios.map((s) => ({
      id: s.id,
      payload: s,
      split: "holdout",
      tags: { persona: s.persona, label: s.label }
    }))
  ];
  const release = evaluateReleaseConfidence({
    target: opts.target,
    candidateId: promotedVariant.id,
    baselineId: baseline.id,
    scenarios: releaseScenarios,
    traces: traceEvidence,
    gateDecision: gate ?? void 0,
    thresholds: opts.releaseThresholds,
    runs: [...evolution.gate?.candidateRuns ?? [], ...evolution.gate?.baselineRuns ?? []]
  });
  if (!promotedChanged) {
    return finalize({
      opts,
      decision: "evolve_yielded_no_improvement",
      startedAt,
      now,
      observedRunCount: observedRuns.length,
      observedFeedbackCount: observedFeedback.length,
      clusters: clusterReport.clusters,
      actedOnCluster: actedOn,
      evolution,
      release,
      gate,
      promotedPrompt: promoted,
      pullRequest: null
    });
  }
  // NOTE(review): when `gate` is null only release.status can block the change
  // — confirm that shipping without a gate decision is acceptable.
  if (release.status === "fail" || gate && !gate.promote) {
    return finalize({
      opts,
      decision: "gate_failed",
      startedAt,
      now,
      observedRunCount: observedRuns.length,
      observedFeedbackCount: observedFeedback.length,
      clusters: clusterReport.clusters,
      actedOnCluster: actedOn,
      evolution,
      release,
      gate,
      promotedPrompt: promoted,
      pullRequest: null
    });
  }
  // Without shipping options the improved prompt is only reported.
  if (!opts.ship) {
    return finalize({
      opts,
      decision: "proposed_change",
      startedAt,
      now,
      observedRunCount: observedRuns.length,
      observedFeedbackCount: observedFeedback.length,
      clusters: clusterReport.clusters,
      actedOnCluster: actedOn,
      evolution,
      release,
      gate,
      promotedPrompt: promoted,
      pullRequest: null
    });
  }
  const baselineStr = toPromptString(baseline.payload);
  const promotedStr = toPromptString(promoted);
  // Context handed to the PR body renderer.
  const ctx = {
    runId: opts.runId,
    target: opts.target,
    decision: "pr_opened",
    clusters: clusterReport.clusters,
    actedOnCluster: actedOn,
    observedRunCount: observedRuns.length,
    observedFeedbackCount: observedFeedback.length,
    evolution,
    release,
    gate,
    baselinePromptString: baselineStr,
    promotedPromptString: promotedStr
  };
  const renderBody = opts.ship.renderBody ?? defaultRenderBody;
  // The default file renderer writes the new prompt followed by one newline.
  const renderFile = opts.ship.renderPromptFile ?? ((next, _prev) => `${next}
`);
  const currentFile = opts.ship.readCurrentPromptFile ? await opts.ship.readCurrentPromptFile() : null;
  const pr = await proposeAutomatedPullRequest(opts.ship.client, {
    repo: opts.ship.repo,
    baseBranch: opts.ship.baseBranch ?? "main",
    // Branch name is "<prefix without trailing slashes>/<runId>".
    branchName: `${opts.ship.branchPrefix.replace(/\/+$/, "")}/${opts.runId}`,
    title: `${opts.target}: production-loop prompt update (${opts.runId})`,
    body: renderBody(ctx),
    reviewers: opts.ship.reviewers,
    labels: opts.ship.labels,
    fileChanges: [
      {
        path: opts.ship.promptFilePath,
        contents: renderFile(promotedStr, currentFile),
        rationale: `Auto-improved against cluster "${actedOn.failureClass}" (${actedOn.runCount} prod failures)`
      }
    ],
    dryRun: opts.ship.dryRun
  });
  // The decision is "pr_opened" even when opts.ship.dryRun was set.
  return finalize({
    opts,
    decision: "pr_opened",
    startedAt,
    now,
    observedRunCount: observedRuns.length,
    observedFeedbackCount: observedFeedback.length,
    clusters: clusterReport.clusters,
    actedOnCluster: actedOn,
    evolution,
    release,
    gate,
    promotedPrompt: promoted,
    pullRequest: pr
  });
}
|
|
2053
|
+
/**
 * Assemble the result object returned by a production-loop cycle.
 *
 * @param args - Cycle state: `opts`, `decision`, `startedAt`, `now`, the
 *   observed counts, `clusters`, `actedOnCluster`, `evolution`, `release`,
 *   `gate`, `promotedPrompt` and `pullRequest`.
 * @returns The cycle report; `finishedAt` is taken from `args.now()` at call
 *   time and `cron` is `null` when the options carry none.
 */
function finalize(args) {
  const { opts, now, ...state } = args;
  const finishedAt = now().toISOString();
  const report = {
    runId: opts.runId,
    target: opts.target,
    decision: state.decision,
    startedAt: state.startedAt,
    finishedAt,
    observedRunCount: state.observedRunCount,
    observedFeedbackCount: state.observedFeedbackCount,
    clusters: state.clusters,
    actedOnCluster: state.actedOnCluster,
    evolution: state.evolution,
    release: state.release,
    gate: state.gate,
    baselinePrompt: opts.evolve.baselinePrompt,
    promotedPrompt: state.promotedPrompt,
    pullRequest: state.pullRequest,
    cron: opts.cron ?? null
  };
  return report;
}
|
|
2073
|
+
/**
 * Check production-loop options before any work is done.
 *
 * Requires a non-blank `runId` and `target`, at least one holdout scenario,
 * search scenarios that are either omitted or non-empty, and — when `ship`
 * is present — a non-blank `branchPrefix` and `promptFilePath`.
 *
 * @param opts - Options passed to `runProductionLoop`.
 * @throws {ValidationError} When any check fails.
 */
function validate2(opts) {
  if (!opts.runId.trim()) throw new ValidationError("runProductionLoop: runId required");
  if (!opts.target.trim()) throw new ValidationError("runProductionLoop: target required");
  if (opts.evolve.holdoutScenarios.length === 0) {
    throw new ValidationError("runProductionLoop: evolve.holdoutScenarios must not be empty");
  }
  if (opts.evolve.searchScenarios && opts.evolve.searchScenarios.length === 0) {
    throw new ValidationError(
      "runProductionLoop: evolve.searchScenarios must be omitted or non-empty"
    );
  }
  // An earlier empty check on `opts.evolve.gate.baselineKey` was removed: it
  // had no body, and merely evaluating it crashed when `evolve.gate` was omitted.
  if (opts.ship) {
    if (!opts.ship.branchPrefix.trim()) {
      throw new ValidationError("runProductionLoop: ship.branchPrefix required");
    }
    if (!opts.ship.promptFilePath.trim()) {
      throw new ValidationError("runProductionLoop: ship.promptFilePath required");
    }
  }
}
|
|
2095
|
+
/**
 * Remove duplicate ids while keeping the first occurrence of each in order.
 *
 * @param ids - Iterable of ids.
 * @returns A new array with duplicates dropped.
 */
function uniqueIds(ids) {
  // A Set iterates in insertion order, so first occurrences keep their position.
  return Array.from(/* @__PURE__ */ new Set(ids));
}
|
|
2105
|
+
/**
 * Derive search scenarios from the holdout set when none were supplied.
 *
 * Every derived scenario is a shallow copy of a holdout scenario whose id is
 * suffixed with "__search". With fewer than four holdout scenarios only the
 * first one is used; otherwise every fourth one (indices 0, 4, 8, ...).
 *
 * @param holdout - Holdout scenarios; expected to be non-empty.
 * @returns The derived search scenarios.
 */
function deriveSearchScenarios(holdout) {
  const asSearch = (scenario) => ({ ...scenario, id: `${scenario.id}__search` });
  if (holdout.length >= 4) {
    return holdout.filter((_, index) => index % 4 === 0).map(asSearch);
  }
  return [asSearch(holdout[0])];
}
|
|
2116
|
+
/**
 * Build a placeholder run record from a single optimization trial.
 *
 * Hashes and the commit sha are all-zero strings, and token usage is zero.
 * The trial score is stored under `holdoutScore` for the holdout split and
 * under `searchScore` for any other split.
 *
 * @param input - `runId`, `target`, `variant`, `scenarioId`, `split`, `seed`
 *   and `trial` (`score`, `ok`, optional `durationMs` and `cost`).
 * @returns The synthetic run record.
 */
function syntheticRunRecord(input) {
  const { trial, split } = input;
  const zeros = (length) => "0".repeat(length);
  const outcome = {};
  if (split === "holdout") {
    outcome.holdoutScore = trial.score;
  } else {
    outcome.searchScore = trial.score;
  }
  outcome.raw = { score: trial.score, ok: trial.ok ? 1 : 0 };
  return {
    runId: input.runId,
    experimentId: input.target,
    candidateId: input.variant.id,
    seed: input.seed,
    model: "production-loop@synthetic",
    promptHash: zeros(64),
    configHash: zeros(64),
    commitSha: zeros(40),
    wallMs: trial.durationMs ?? 1,
    costUsd: trial.cost ?? 0,
    tokenUsage: { input: 0, output: 0 },
    outcome,
    splitTag: split,
    scenarioId: input.scenarioId
  };
}
|
|
2138
|
+
/**
 * Convert a prompt payload to text.
 *
 * Strings are returned as-is, `null`/`undefined` become "", and other values
 * are pretty-printed as JSON. Values JSON cannot represent (for example
 * functions, symbols, BigInts or circular structures) fall back to
 * `String(payload)`, so the result is always a string.
 *
 * @param payload - Prompt payload of any type.
 * @returns The payload as a string.
 */
function toPromptString(payload) {
  if (typeof payload === "string") return payload;
  if (payload == null) return "";
  try {
    const json = JSON.stringify(payload, null, 2);
    // JSON.stringify yields undefined (not a string) for functions and symbols.
    if (typeof json === "string") return json;
  } catch {
    // Not serializable (cycle, BigInt, throwing toJSON): use the fallback below.
  }
  return String(payload);
}
|
|
2147
|
+
/**
 * Render the default Markdown body for a production-loop pull request.
 *
 * Sections, in order: header with run id, decision and observed counts; the
 * triggering failure cluster (when present); the held-out promotion gate
 * (when present); release confidence (when present); and a fenced diff of
 * the baseline prompt against the promoted prompt.
 *
 * @param ctx - Render context: `target`, `runId`, `decision`, observed
 *   counts, `actedOnCluster`, `gate`, `release`, `baselinePromptString` and
 *   `promotedPromptString`.
 * @returns The Markdown text.
 */
function defaultRenderBody(ctx) {
  const { actedOnCluster: cluster, release, gate } = ctx;
  // All numeric evidence is shown with four decimals.
  const fixed4 = (value) => value.toFixed(4);

  const body = [
    `## Production-loop prompt update \u2014 \`${ctx.target}\``,
    "",
    `Run id: \`${ctx.runId}\``,
    `Decision: \`${ctx.decision}\``,
    `Observed in this cycle: ${ctx.observedRunCount} prod runs, ${ctx.observedFeedbackCount} feedback trajectories.`,
    ""
  ];

  if (cluster) {
    body.push("### Triggering failure cluster", "");
    body.push(`- **class**: \`${cluster.failureClass}\``);
    body.push(`- **runs in cluster**: ${cluster.runCount}`);
    body.push(`- **distinct scenarios**: ${cluster.scenarioIds.length}`);
    if (cluster.toolName) {
      body.push(`- **tool**: \`${cluster.toolName}\``);
    }
    if (cluster.dimension) {
      body.push(`- **judge dimension**: \`${cluster.dimension}\``);
    }
    if (cluster.exampleError) {
      // Keep the example on one line and at most 200 characters.
      const excerpt = cluster.exampleError.slice(0, 200).replace(/\n/g, " ");
      body.push(`- **example error**: \`${excerpt}\``);
    }
    body.push("");
  }

  if (gate) {
    const evidence = gate.evidence;
    body.push("### Held-out promotion gate", "");
    body.push(`- **decision**: \`${gate.promote ? "PROMOTE" : "REJECT"}\``);
    body.push(`- **paired median delta**: ${fixed4(evidence.medianPairedDelta)}`);
    body.push(
      `- **paired 95% CI**: [${fixed4(evidence.pairedCI.low)}, ${fixed4(evidence.pairedCI.high)}]`
    );
    body.push(`- **paired p-value**: ${fixed4(evidence.pairedPValue)}`);
    body.push(
      `- **search/holdout means**: ${fixed4(evidence.searchScore)} / ${fixed4(evidence.holdoutScore)}`
    );
    body.push(`- **overfit gap**: ${fixed4(evidence.overfitGap)}`);
    body.push("");
  }

  if (release) {
    body.push("### Release confidence", "");
    body.push(`- **status**: \`${release.status}\``);
    body.push(`- **pass rate**: ${fixed4(release.metrics.passRate)}`);
    body.push(`- **mean score**: ${fixed4(release.metrics.meanScore)}`);
    if (release.issues.length > 0) {
      body.push("- **issues**:");
      release.issues.forEach((issue) => {
        body.push(`  - \`${issue.severity}\` ${issue.axis}: ${issue.detail}`);
      });
    }
    body.push("");
  }

  body.push("### Prompt diff", "", "```diff");
  body.push(unifiedDiff(ctx.baselinePromptString, ctx.promotedPromptString));
  body.push("```");
  return body.join("\n");
}
|
|
2211
|
+
/**
 * Produce a minimal line diff between two texts.
 *
 * Removed lines are prefixed with "- " and added lines with "+ "; unchanged
 * lines are omitted. Lines are aligned with a longest-common-subsequence
 * match, so one inserted or deleted line no longer marks every following
 * line as changed (which a position-by-position comparison does).
 *
 * @param a - Original text.
 * @param b - Updated text.
 * @returns The diff lines joined with "\n"; "" when the texts are equal.
 */
function unifiedDiff(a, b) {
  const aLines = a.split("\n");
  const bLines = b.split("\n");

  // Strip the shared prefix and suffix so the quadratic table below only
  // covers the region that actually differs.
  let start = 0;
  while (start < aLines.length && start < bLines.length && aLines[start] === bLines[start]) {
    start++;
  }
  let aEnd = aLines.length;
  let bEnd = bLines.length;
  while (aEnd > start && bEnd > start && aLines[aEnd - 1] === bLines[bEnd - 1]) {
    aEnd--;
    bEnd--;
  }
  const left = aLines.slice(start, aEnd);
  const right = bLines.slice(start, bEnd);

  // lcs[i][j] = length of the longest common subsequence of left[i..] and right[j..].
  const lcs = Array.from({ length: left.length + 1 }, () => new Array(right.length + 1).fill(0));
  for (let i = left.length - 1; i >= 0; i--) {
    for (let j = right.length - 1; j >= 0; j--) {
      lcs[i][j] = left[i] === right[j] ? lcs[i + 1][j + 1] + 1 : Math.max(lcs[i + 1][j], lcs[i][j + 1]);
    }
  }

  const out = [];
  let i = 0;
  let j = 0;
  while (i < left.length && j < right.length) {
    if (left[i] === right[j]) {
      i++;
      j++;
    } else if (lcs[i + 1][j] >= lcs[i][j + 1]) {
      // Ties prefer the removal, so a replaced line reads "- old" then "+ new".
      out.push(`- ${left[i]}`);
      i++;
    } else {
      out.push(`+ ${right[j]}`);
      j++;
    }
  }
  while (i < left.length) {
    out.push(`- ${left[i]}`);
    i++;
  }
  while (j < right.length) {
    out.push(`+ ${right[j]}`);
    j++;
  }
  return out.join("\n");
}
|
|
2225
|
+
|
|
1537
2226
|
// src/registry.ts
|
|
1538
2227
|
var ScenarioRegistry = class {
|
|
1539
2228
|
scenarios = [];
|
|
@@ -8417,6 +9106,7 @@ export {
|
|
|
8417
9106
|
formatDriverReport,
|
|
8418
9107
|
formatFindings,
|
|
8419
9108
|
gainHistogram,
|
|
9109
|
+
ghCliClient,
|
|
8420
9110
|
precision as goldenPrecision,
|
|
8421
9111
|
gradeSemanticStatus,
|
|
8422
9112
|
groupBy,
|
|
@@ -8424,6 +9114,7 @@ export {
|
|
|
8424
9114
|
hashJson,
|
|
8425
9115
|
hashScenarios,
|
|
8426
9116
|
htmlContainsElement,
|
|
9117
|
+
httpGithubClient,
|
|
8427
9118
|
inMemoryReferenceReplayStore,
|
|
8428
9119
|
inMemoryReviewStore,
|
|
8429
9120
|
integrationAsi,
|
|
@@ -8484,6 +9175,7 @@ export {
|
|
|
8484
9175
|
printDriverSummary,
|
|
8485
9176
|
probeLlm,
|
|
8486
9177
|
promptBisect,
|
|
9178
|
+
proposeAutomatedPullRequest,
|
|
8487
9179
|
proposeSynthesisTargets,
|
|
8488
9180
|
providerFromBaseUrl,
|
|
8489
9181
|
pytestTestParser,
|
|
@@ -8528,6 +9220,7 @@ export {
|
|
|
8528
9220
|
runKeywordCoverageJudgeUrl,
|
|
8529
9221
|
runLiveProof,
|
|
8530
9222
|
runMultiShotOptimization,
|
|
9223
|
+
runProductionLoop,
|
|
8531
9224
|
runPromptEvolution,
|
|
8532
9225
|
runProposeReview,
|
|
8533
9226
|
runProposeReviewAsControlLoop,
|