@tangle-network/agent-eval 0.24.0 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +138 -0
- package/README.md +72 -0
- package/dist/{chunk-SY6WAAAD.js → chunk-5LBB5B3Z.js} +296 -5
- package/dist/chunk-5LBB5B3Z.js.map +1 -0
- package/dist/{chunk-OHEPNJQN.js → chunk-JLZQWFV3.js} +65 -1
- package/dist/chunk-JLZQWFV3.js.map +1 -0
- package/dist/{chunk-VRJVTXRV.js → chunk-WHZMVFUV.js} +85 -85
- package/dist/chunk-WHZMVFUV.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/governance/index.d.ts +1 -1
- package/dist/{index-Oj9fAPPN.d.ts → index-D3iBCjdF.d.ts} +63 -2
- package/dist/index.d.ts +529 -12
- package/dist/index.js +1106 -17
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +491 -1
- package/dist/optimization.d.ts +2 -2
- package/dist/optimization.js +1 -1
- package/dist/pipelines/index.js +3 -67
- package/dist/pipelines/index.js.map +1 -1
- package/dist/{release-report-TDPn1cxq.d.ts → release-report-wfUySN5F.d.ts} +1 -1
- package/dist/reporting.d.ts +2 -2
- package/dist/{researcher-CUOiGcGv.d.ts → researcher-bGkI7vCl.d.ts} +1 -1
- package/dist/rl.d.ts +3 -3
- package/dist/{summary-report-BXGs_9V0.d.ts → summary-report-DZVXOCK_.d.ts} +13 -1
- package/dist/wire/index.d.ts +347 -3
- package/dist/wire/index.js +19 -1
- package/docs/concepts.md +11 -0
- package/package.json +1 -1
- package/dist/chunk-OHEPNJQN.js.map +0 -1
- package/dist/chunk-SY6WAAAD.js.map +0 -1
- package/dist/chunk-VRJVTXRV.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -8,9 +8,10 @@ import {
|
|
|
8
8
|
classifyFailure,
|
|
9
9
|
compareToBaseline,
|
|
10
10
|
computeToolUseMetrics,
|
|
11
|
+
failureClusterView,
|
|
11
12
|
iqr,
|
|
12
13
|
welchsTTest
|
|
13
|
-
} from "./chunk-
|
|
14
|
+
} from "./chunk-JLZQWFV3.js";
|
|
14
15
|
import {
|
|
15
16
|
exportTrainingData,
|
|
16
17
|
toNdjson
|
|
@@ -95,7 +96,7 @@ import {
|
|
|
95
96
|
summarizePreferenceMemory,
|
|
96
97
|
trialTraceFromMultiShotTrial,
|
|
97
98
|
withAssignedFeedbackSplit
|
|
98
|
-
} from "./chunk-
|
|
99
|
+
} from "./chunk-WHZMVFUV.js";
|
|
99
100
|
import {
|
|
100
101
|
RunRecordValidationError,
|
|
101
102
|
isRunRecord,
|
|
@@ -220,6 +221,304 @@ import {
|
|
|
220
221
|
} from "./chunk-NG236HPC.js";
|
|
221
222
|
import "./chunk-PZ5AY32C.js";
|
|
222
223
|
|
|
224
|
+
// src/auto-pr.ts
|
|
225
|
+
/**
 * Validates a pull-request proposal and hands it to the given client.
 *
 * @param {{ proposeChange: Function }} client - Object whose `proposeChange(input)` performs the proposal.
 * @param {object} input - Proposal description; checked by `validate` before the client is called.
 * @returns {Promise<*>} Whatever `client.proposeChange(input)` resolves to.
 * @throws Whatever `validate` throws for a malformed input; the client is not called in that case.
 */
async function proposeAutomatedPullRequest(client, input) {
  // Validation runs first so the client is never invoked with a rejected input.
  validate(input);
  const pending = client.proposeChange(input);
  return pending;
}
|
|
229
|
+
/**
 * Checks a pull-request proposal before it is sent to a client.
 *
 * Checks run in this order: repository owner/name, branch name, branch differs
 * from base (base defaults to "main"), at least one file change, every file
 * path (non-blank, no "..", no leading "/", no duplicates), and finally title.
 *
 * @param {object} input - Proposal with `repo`, `branchName`, optional `baseBranch`, `fileChanges` and `title`.
 * @returns {void}
 * @throws {ValidationError} On the first check that fails.
 */
function validate(input) {
  const reject = (message) => {
    throw new ValidationError(message);
  };

  const { repo } = input;
  if (!(repo.owner.trim() && repo.name.trim())) {
    reject("proposeAutomatedPullRequest: repo.owner and repo.name required");
  }

  const branch = input.branchName;
  if (!branch.trim() || /\s/.test(branch)) {
    reject(
      "proposeAutomatedPullRequest: branchName must be non-empty and contain no whitespace"
    );
  }

  const base = input.baseBranch ?? "main";
  if (branch === base) {
    reject("proposeAutomatedPullRequest: branchName must differ from baseBranch");
  }

  if (input.fileChanges.length === 0) {
    reject("proposeAutomatedPullRequest: fileChanges must not be empty");
  }

  const visited = /* @__PURE__ */ new Set();
  for (const { path: filePath } of input.fileChanges) {
    const blank = !filePath.trim();
    if (blank || filePath.includes("..") || filePath.startsWith("/")) {
      reject(
        `proposeAutomatedPullRequest: invalid file path "${filePath}" (no '..' or leading '/')`
      );
    }
    if (visited.has(filePath)) {
      reject(`proposeAutomatedPullRequest: duplicate file path "${filePath}"`);
    }
    visited.add(filePath);
  }

  if (!input.title.trim()) {
    reject("proposeAutomatedPullRequest: title must not be empty");
  }
}
|
|
260
|
+
/**
 * Builds a client that proposes changes through GitHub's REST API.
 *
 * `proposeChange` resolves the base branch, creates one blob per file change,
 * a tree on top of the base commit's tree, and a commit whose parent is the
 * base commit. It then creates the branch ref (or force-updates it when it
 * already points elsewhere), reuses the first open PR for that head or opens a
 * new one, and finally requests reviewers / adds labels on a best-effort basis.
 *
 * @param {object} opts
 * @param {string} opts.token - Sent as `authorization: Bearer <token>`.
 * @param {Function} [opts.fetchImpl] - `fetch`-compatible function; defaults to the global `fetch`.
 * @param {string} [opts.apiBase] - API root; defaults to "https://api.github.com". Trailing slashes are stripped.
 * @param {() => Date} [opts.now] - Clock used for the commit author date.
 * @returns {{ proposeChange(input: object): Promise<{ prUrl: string, branchName: string, headSha: string, dryRun: boolean }> }}
 */
function httpGithubClient(opts) {
  const fetchImpl = opts.fetchImpl ?? fetch;
  const apiBase = (opts.apiBase ?? "https://api.github.com").replace(/\/+$/, "");
  const now = opts.now ?? (() => /* @__PURE__ */ new Date());

  /**
   * Performs one JSON request against the API.
   * Returns the parsed JSON body, or `null` when `accept404` is set and the
   * response status is 404. Any other non-OK status throws a ConfigError that
   * carries the status and up to 400 characters of the response text.
   */
  async function api(method, path, body, accept404 = false) {
    const res = await fetchImpl(`${apiBase}${path}`, {
      method,
      headers: {
        accept: "application/vnd.github+json",
        "content-type": "application/json",
        authorization: `Bearer ${opts.token}`,
        "x-github-api-version": "2022-11-28"
      },
      body: body === void 0 ? void 0 : JSON.stringify(body)
    });
    if (accept404 && res.status === 404) return null;
    if (!res.ok) {
      const text = await res.text().catch(() => "");
      throw new ConfigError(
        `proposeAutomatedPullRequest: GitHub ${method} ${path} \u2192 ${res.status} ${text.slice(0, 400)}`
      );
    }
    return await res.json();
  }

  return {
    async proposeChange(input) {
      const baseBranch = input.baseBranch ?? "main";
      const repoPath = `/repos/${input.repo.owner}/${input.repo.name}`;

      // Dry runs never touch the network; they only report the compare URL.
      if (input.dryRun) {
        return {
          prUrl: `https://github.com/${input.repo.owner}/${input.repo.name}/compare/${baseBranch}...${input.branchName}`,
          branchName: input.branchName,
          headSha: "dry-run",
          dryRun: true
        };
      }

      // accept404 = true so a missing base branch reaches the dedicated error
      // below instead of the generic HTTP failure (which made this check dead).
      const baseRef = await api(
        "GET",
        `${repoPath}/git/ref/heads/${baseBranch}`,
        void 0,
        true
      );
      if (!baseRef) {
        throw new ConfigError(`proposeAutomatedPullRequest: base branch "${baseBranch}" not found`);
      }
      const baseSha = baseRef.object.sha;

      // Same reasoning: let a 404 on the commit lookup surface as the specific error.
      const baseCommit = await api("GET", `${repoPath}/git/commits/${baseSha}`, void 0, true);
      if (!baseCommit) {
        throw new ConfigError(
          `proposeAutomatedPullRequest: base commit ${baseSha} not found (race condition?)`
        );
      }

      // One blob per changed file, created sequentially.
      const treeEntries = [];
      for (const change of input.fileChanges) {
        const blob = await api("POST", `${repoPath}/git/blobs`, {
          content: change.contents,
          encoding: "utf-8"
        });
        if (!blob) throw new ConfigError("proposeAutomatedPullRequest: blob creation returned null");
        treeEntries.push({
          path: change.path,
          mode: "100644",
          type: "blob",
          sha: blob.sha
        });
      }

      const tree = await api("POST", `${repoPath}/git/trees`, {
        base_tree: baseCommit.tree.sha,
        tree: treeEntries
      });
      if (!tree) throw new ConfigError("proposeAutomatedPullRequest: tree creation returned null");

      // Author/committer are only sent when both name and email were supplied.
      const author = input.authorName && input.authorEmail
        ? { name: input.authorName, email: input.authorEmail, date: now().toISOString() }
        : void 0;
      const commitMessage = renderCommitMessage(input);
      const commit = await api("POST", `${repoPath}/git/commits`, {
        message: commitMessage,
        tree: tree.sha,
        parents: [baseSha],
        ...author ? { author, committer: author } : {}
      });
      if (!commit) {
        throw new ConfigError("proposeAutomatedPullRequest: commit creation returned null");
      }

      // Create the branch, or force-move it when it already exists at another commit.
      const existing = await api(
        "GET",
        `${repoPath}/git/ref/heads/${input.branchName}`,
        void 0,
        true
      );
      if (!existing) {
        await api("POST", `${repoPath}/git/refs`, {
          ref: `refs/heads/${input.branchName}`,
          sha: commit.sha
        });
      } else if (existing.object.sha !== commit.sha) {
        await api("PATCH", `${repoPath}/git/refs/heads/${input.branchName}`, {
          sha: commit.sha,
          force: true
        });
      }

      // Reuse the first open PR for this head if there is one.
      const openPrs = await api(
        "GET",
        `${repoPath}/pulls?state=open&head=${encodeURIComponent(`${input.repo.owner}:${input.branchName}`)}`
      );
      let pr;
      if (openPrs && openPrs.length > 0) {
        pr = openPrs[0];
      } else {
        const created = await api("POST", `${repoPath}/pulls`, {
          title: input.title,
          body: input.body,
          head: input.branchName,
          base: baseBranch
        });
        if (!created) {
          throw new ConfigError("proposeAutomatedPullRequest: PR creation returned null");
        }
        pr = created;
      }

      // Reviewers and labels are deliberately best-effort: a failure here is
      // ignored so it cannot fail a PR that has already been opened.
      if (input.reviewers && input.reviewers.length > 0) {
        await api(
          "POST",
          `${repoPath}/pulls/${pr.number}/requested_reviewers`,
          { reviewers: input.reviewers },
          true
        ).catch(() => {
        });
      }
      if (input.labels && input.labels.length > 0) {
        await api(
          "POST",
          `${repoPath}/issues/${pr.number}/labels`,
          { labels: input.labels },
          true
        ).catch(() => {
        });
      }

      return {
        prUrl: pr.html_url,
        branchName: input.branchName,
        headSha: commit.sha,
        dryRun: false
      };
    }
  };
}
|
|
398
|
+
/**
 * Builds a client that proposes changes by driving `git` and the GitHub CLI
 * in a local checkout.
 *
 * NOTE: `proposeChange` is destructive for the working tree at `cwd`: it
 * checks out the base branch, hard-resets it to `origin/<base>`, deletes any
 * local branch with the target name, and force-pushes the new branch.
 *
 * @param {object} [opts]
 * @param {string} [opts.bin] - GitHub CLI executable; defaults to "gh".
 * @param {string} [opts.cwd] - Checkout directory; defaults to `process.cwd()`.
 * @param {Function} [opts.exec] - `(cmd, args, options) => Promise<{ stdout, stderr, exitCode }>`; defaults to `defaultExec`.
 * @returns {{ proposeChange(input: object): Promise<{ prUrl: string, branchName: string, headSha: string, dryRun: boolean }> }}
 */
function ghCliClient(opts = {}) {
  const bin = opts.bin ?? "gh";
  const cwd = opts.cwd ?? process.cwd();
  const exec = opts.exec ?? defaultExec;

  /**
   * Runs a command and throws a ConfigError when it exits non-zero.
   * `env`, when given, is forwarded to the exec implementation as `options.env`.
   */
  async function run(cmd, args, stdin, env) {
    const r = await exec(cmd, args, env ? { cwd, stdin, env } : { cwd, stdin });
    if (r.exitCode !== 0) {
      throw new ConfigError(
        `proposeAutomatedPullRequest: ${cmd} ${args.join(" ")} failed (${r.exitCode}): ${r.stderr.trim() || r.stdout.trim()}`
      );
    }
    return r;
  }

  return {
    async proposeChange(input) {
      const baseBranch = input.baseBranch ?? "main";

      // Dry runs execute nothing; they only report the compare URL.
      if (input.dryRun) {
        return {
          prUrl: `https://github.com/${input.repo.owner}/${input.repo.name}/compare/${baseBranch}...${input.branchName}`,
          branchName: input.branchName,
          headSha: "dry-run",
          dryRun: true
        };
      }

      await run("git", ["fetch", "origin", baseBranch]);
      await run("git", ["checkout", baseBranch]);
      await run("git", ["reset", "--hard", `origin/${baseBranch}`]);
      // Called through `exec` rather than `run`: a non-zero exit (for example
      // when the branch does not exist yet) is intentionally ignored.
      await exec("git", ["branch", "-D", input.branchName], { cwd });
      await run("git", ["checkout", "-b", input.branchName]);

      const { mkdir, writeFile } = await import("fs/promises");
      const { dirname: dirname5, join: join4, resolve } = await import("path");
      for (const change of input.fileChanges) {
        const abs = resolve(cwd, change.path);
        await mkdir(dirname5(abs), { recursive: true });
        await writeFile(abs, change.contents, "utf8");
        await run("git", ["add", join4(change.path)]);
      }

      // Author identity for the commit. Previously this object was built but
      // never handed to anything, so authorName/authorEmail had no effect.
      const env = {};
      if (input.authorName) env.GIT_AUTHOR_NAME = input.authorName;
      if (input.authorEmail) env.GIT_AUTHOR_EMAIL = input.authorEmail;
      if (input.authorName) env.GIT_COMMITTER_NAME = input.authorName;
      if (input.authorEmail) env.GIT_COMMITTER_EMAIL = input.authorEmail;
      const commitEnv = Object.keys(env).length > 0 ? env : void 0;

      const message = renderCommitMessage(input);
      await run("git", ["commit", "-m", message], void 0, commitEnv);
      const headRes = await run("git", ["rev-parse", "HEAD"]);
      const headSha = headRes.stdout.trim();
      await run("git", ["push", "-f", "origin", input.branchName]);

      // Look for an already-open PR for this head; a failing `pr list` is
      // treated the same as "none found".
      const existing = await exec(
        bin,
        [
          "pr",
          "list",
          "--state",
          "open",
          "--head",
          input.branchName,
          "--json",
          "url,number",
          "--limit",
          "1"
        ],
        { cwd }
      );
      let prUrl = "";
      if (existing.exitCode === 0 && existing.stdout.trim()) {
        const parsed = JSON.parse(existing.stdout);
        if (parsed.length > 0 && parsed[0]) prUrl = parsed[0].url;
      }

      if (!prUrl) {
        const args = [
          "pr",
          "create",
          "--title",
          input.title,
          "--body",
          input.body,
          "--base",
          baseBranch
        ];
        if (input.reviewers && input.reviewers.length > 0) {
          args.push("--reviewer", input.reviewers.join(","));
        }
        if (input.labels && input.labels.length > 0) {
          args.push("--label", input.labels.join(","));
        }
        const r = await run(bin, args);
        // Use the first URL in the output, or the whole trimmed output if none is found.
        const match = r.stdout.match(/https?:\/\/\S+/);
        prUrl = match ? match[0] : r.stdout.trim();
      }

      return { prUrl, branchName: input.branchName, headSha, dryRun: false };
    }
  };
}
|
|
491
|
+
/**
 * Spawns a child process and collects its output. Never rejects: a spawn
 * failure is reported as `exitCode: 1` with the error message appended to stderr.
 *
 * @param {string} bin - Executable to run.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {object} opts
 * @param {string} opts.cwd - Working directory of the child.
 * @param {string} [opts.stdin] - Text written to the child's stdin before it is closed.
 * @param {Record<string, string>} [opts.env] - Extra environment variables, layered over `process.env`.
 * @returns {Promise<{ stdout: string, stderr: string, exitCode: number }>}
 */
async function defaultExec(bin, args, opts) {
  const { spawn } = await import("child_process");
  return new Promise((resolveExec) => {
    const spawnOptions = { cwd: opts.cwd };
    // Only override the environment when asked to; otherwise spawn inherits process.env.
    if (opts.env) spawnOptions.env = { ...process.env, ...opts.env };
    const child = spawn(bin, args, spawnOptions);

    let stdout = "";
    let stderr = "";
    child.stdout.on("data", (d) => {
      stdout += d.toString();
    });
    child.stderr.on("data", (d) => {
      stderr += d.toString();
    });

    // A write to a child that failed to start or already exited would emit an
    // 'error' on stdin; without a listener that would crash the process.
    child.stdin.on("error", () => {
    });
    // Always close stdin so a child that reads until EOF cannot hang waiting
    // for input that will never arrive.
    if (opts.stdin) child.stdin.end(opts.stdin);
    else child.stdin.end();

    child.on("error", (err) => {
      resolveExec({ stdout, stderr: `${stderr}${err.message}`, exitCode: 1 });
    });
    child.on("close", (code) => {
      // `code` is null when the child was terminated by a signal; report that as failure.
      resolveExec({ stdout, stderr, exitCode: code ?? 1 });
    });
  });
}
|
|
512
|
+
/**
 * Renders the commit message for a proposal: the title, a blank line, one
 * `- <path>: <rationale>` bullet per file change that has a rationale, a blank
 * line, then the trimmed body. The whole message is trimmed.
 *
 * @param {object} input
 * @param {string} input.title - First line of the message.
 * @param {Array<{ path: string, rationale?: string }>} input.fileChanges
 * @param {string} [input.body] - Free-form body; a missing body is treated as empty.
 * @returns {string}
 */
function renderCommitMessage(input) {
  const bullets = input.fileChanges
    .filter((change) => change.rationale)
    .map((change) => `- ${change.path}: ${change.rationale}`);
  const lines = [input.title, "", ...bullets];
  // Separate the bullets from the body; with no bullets the blank line is already there.
  if (lines[lines.length - 1] !== "") lines.push("");
  // `body` is not checked by validation, so tolerate it being absent instead of throwing.
  lines.push((input.body ?? "").trim());
  return lines.join("\n").trim();
}
|
|
521
|
+
|
|
223
522
|
// src/executor.ts
|
|
224
523
|
async function executeScenario(tc, scenario, config) {
|
|
225
524
|
const startTime = Date.now();
|
|
@@ -1534,6 +1833,396 @@ function liveProofToReleaseTrace(config, trajectory, durationMs) {
|
|
|
1534
1833
|
};
|
|
1535
1834
|
}
|
|
1536
1835
|
|
|
1836
|
+
// src/production-loop.ts
|
|
1837
|
+
/**
 * Runs one production-loop cycle: observe runs and feedback, pick a failure
 * cluster, evolve the prompt against it, evaluate release confidence, and
 * optionally open a pull request with the promoted prompt.
 *
 * The result's `decision` is one of:
 *  - "no_actionable_failures"        no cluster passed the size/severity filters
 *  - "evolve_yielded_no_improvement" the promoted variant is still the baseline
 *  - "gate_failed"                   release status is "fail" or the gate refused promotion
 *  - "proposed_change"               a better prompt was found but `opts.ship` is not set
 *  - "pr_opened"                     the change was handed to `proposeAutomatedPullRequest`
 *
 * @param {object} opts - Loop configuration (`runId`, `target`, `traceStore`, `feedbackStore`, `cluster`, `evolve`, optional `ship`, `releaseThresholds`, `now`, `cron`).
 * @returns {Promise<object>} The report object built by `finalize`.
 * @throws {ValidationError} When search and holdout scenario ids overlap (in addition to whatever `validate2` throws).
 */
async function runProductionLoop(opts) {
  validate2(opts);
  const now = opts.now ?? (() => /* @__PURE__ */ new Date());
  const startedAt = now().toISOString();

  // --- Observe ---------------------------------------------------------
  const observedRuns = await opts.traceStore.listRuns();
  const observedFeedback = await opts.feedbackStore.list();
  const clusterOpts = opts.cluster;
  const clusterReport = await failureClusterView(opts.traceStore, {
    minClusterSize: clusterOpts.minClusterSize ?? 1
  });

  // Every exit path reports the same observation fields; only the outcome differs.
  const conclude = (decision, outcome) => finalize({
    opts,
    decision,
    startedAt,
    now,
    observedRunCount: observedRuns.length,
    observedFeedbackCount: observedFeedback.length,
    clusters: clusterReport.clusters,
    ...outcome
  });

  // --- Select a cluster to act on ---------------------------------------
  const minSize = clusterOpts.minClusterSize ?? 5;
  const minSeverity = clusterOpts.minSeverityRatio ?? 0.05;
  const maxClusters = clusterOpts.maxClustersPerCycle ?? 1;
  const { totalRuns } = clusterReport;
  const isActionable = (c) =>
    c.runCount >= minSize && (totalRuns === 0 || c.runCount / totalRuns >= minSeverity);
  const actionable = clusterReport.clusters.filter((c) => isActionable(c)).slice(0, maxClusters);
  if (actionable.length === 0) {
    return conclude("no_actionable_failures", {
      actedOnCluster: null,
      evolution: null,
      release: null,
      gate: null,
      promotedPrompt: opts.evolve.baselinePrompt,
      pullRequest: null
    });
  }
  const actedOn = actionable[0];

  // --- Evolve -------------------------------------------------------------
  const evolve = opts.evolve;
  const baselineName = evolve.baselineId ?? "baseline";
  const baseline = {
    id: baselineName,
    label: baselineName,
    generation: 0,
    payload: evolve.baselinePrompt
  };
  const holdoutIds = uniqueIds(evolve.holdoutScenarios.map((s) => s.id));
  const searchSource = evolve.searchScenarios ?? deriveSearchScenarios(evolve.holdoutScenarios);
  const searchIds = uniqueIds(searchSource.map((s) => s.id));
  const holdoutIdSet = new Set(holdoutIds);
  if (searchIds.some((id) => holdoutIdSet.has(id))) {
    throw new ValidationError(
      "runProductionLoop: searchScenarios and holdoutScenarios must be disjoint"
    );
  }

  const reps = evolve.reps ?? 3;
  const generations = evolve.generations ?? 3;
  const populationSize = evolve.populationSize ?? Math.max(2, evolve.reps ?? 4);
  // Used when the caller does not supply its own run-record mapper.
  const fallbackToRunRecord = ({ variant, scenarioId, rep, split, seed, trial }) =>
    syntheticRunRecord({
      runId: `${opts.runId}-${variant.id}-${scenarioId}-${rep}-${split}`,
      variant,
      scenarioId,
      rep,
      split,
      seed,
      trial,
      target: opts.target
    });
  const evolution = await runMultiShotOptimization({
    runId: `${opts.runId}/evolve`,
    target: opts.target,
    seedVariants: [baseline],
    searchScenarioIds: searchIds,
    reps,
    generations,
    populationSize,
    scoreConcurrency: evolve.scoreConcurrency ?? 1,
    runner: evolve.runner,
    scorer: evolve.scorer,
    mutateAdapter: evolve.mutator,
    gate: {
      holdoutScenarioIds: holdoutIds,
      reps,
      gate: { ...evolve.gate, baselineKey: baseline.id },
      toRunRecord: evolve.toRunRecord ?? fallbackToRunRecord
    }
  });

  const gate = evolution.gate?.decision ?? null;
  const promotedVariant = evolution.promotedVariant;
  const promoted = promotedVariant.payload;
  const promotedChanged = promotedVariant.id !== baseline.id;

  // --- Release confidence -------------------------------------------------
  const allTrials = evolution.evolution.generations.flatMap((g) => g.trials);
  const traceEvidence = releaseTraceEvidenceFromMultiShotTrials(allTrials);
  const asReleaseScenario = (split) => (s) => ({
    id: s.id,
    payload: s,
    split,
    tags: { persona: s.persona, label: s.label }
  });
  const toTrain = asReleaseScenario("train");
  const toHoldout = asReleaseScenario("holdout");
  const releaseScenarios = [
    ...(evolve.searchScenarios ?? []).map((s) => toTrain(s)),
    ...evolve.holdoutScenarios.map((s) => toHoldout(s))
  ];
  const release = evaluateReleaseConfidence({
    target: opts.target,
    candidateId: promotedVariant.id,
    baselineId: baseline.id,
    scenarios: releaseScenarios,
    traces: traceEvidence,
    gateDecision: gate ?? void 0,
    thresholds: opts.releaseThresholds,
    runs: [...evolution.gate?.candidateRuns ?? [], ...evolution.gate?.baselineRuns ?? []]
  });

  // Outcome fields shared by every exit path from here on.
  const evolved = {
    actedOnCluster: actedOn,
    evolution,
    release,
    gate,
    promotedPrompt: promoted
  };

  if (!promotedChanged) {
    return conclude("evolve_yielded_no_improvement", { ...evolved, pullRequest: null });
  }
  if (release.status === "fail" || gate && !gate.promote) {
    return conclude("gate_failed", { ...evolved, pullRequest: null });
  }
  if (!opts.ship) {
    return conclude("proposed_change", { ...evolved, pullRequest: null });
  }

  // --- Ship -----------------------------------------------------------------
  const ship = opts.ship;
  const baselineStr = toPromptString(baseline.payload);
  const promotedStr = toPromptString(promoted);
  const ctx = {
    runId: opts.runId,
    target: opts.target,
    decision: "pr_opened",
    clusters: clusterReport.clusters,
    actedOnCluster: actedOn,
    observedRunCount: observedRuns.length,
    observedFeedbackCount: observedFeedback.length,
    evolution,
    release,
    gate,
    baselinePromptString: baselineStr,
    promotedPromptString: promotedStr
  };
  const renderBody = ship.renderBody ?? defaultRenderBody;
  // Default file renderer: the promoted prompt followed by a single newline.
  const renderFile = ship.renderPromptFile ?? ((next, _prev) => `${next}\n`);
  let currentFile = null;
  if (ship.readCurrentPromptFile) {
    currentFile = await ship.readCurrentPromptFile();
  }
  const pr = await proposeAutomatedPullRequest(ship.client, {
    repo: ship.repo,
    baseBranch: ship.baseBranch ?? "main",
    branchName: `${ship.branchPrefix.replace(/\/+$/, "")}/${opts.runId}`,
    title: `${opts.target}: production-loop prompt update (${opts.runId})`,
    body: renderBody(ctx),
    reviewers: ship.reviewers,
    labels: ship.labels,
    fileChanges: [
      {
        path: ship.promptFilePath,
        contents: renderFile(promotedStr, currentFile),
        rationale: `Auto-improved against cluster "${actedOn.failureClass}" (${actedOn.runCount} prod failures)`
      }
    ],
    dryRun: ship.dryRun
  });
  return conclude("pr_opened", { ...evolved, pullRequest: pr });
}
|
|
2053
|
+
/**
 * Assembles the final production-loop report from the collected pieces.
 *
 * `finishedAt` is taken from `args.now()` at the moment this function runs;
 * `baselinePrompt` and `cron` come from the original options (`cron` falls
 * back to `null`).
 *
 * @param {object} args - Collected loop state (`opts`, `decision`, `startedAt`, `now`, counts, clusters, evolution, release, gate, prompts, pull request).
 * @returns {object} The report object.
 */
function finalize(args) {
  const {
    opts,
    decision,
    startedAt,
    observedRunCount,
    observedFeedbackCount,
    clusters,
    actedOnCluster,
    evolution,
    release,
    gate,
    promotedPrompt,
    pullRequest
  } = args;
  const finishedAt = args.now().toISOString();
  return {
    runId: opts.runId,
    target: opts.target,
    decision,
    startedAt,
    finishedAt,
    observedRunCount,
    observedFeedbackCount,
    clusters,
    actedOnCluster,
    evolution,
    release,
    gate,
    baselinePrompt: opts.evolve.baselinePrompt,
    promotedPrompt,
    pullRequest,
    cron: opts.cron ?? null
  };
}
|
|
2073
|
+
/**
 * Validates the options passed to `runProductionLoop`.
 *
 * Requires a non-blank `runId` and `target`, at least one holdout scenario,
 * and `searchScenarios` either omitted or non-empty. When `ship` is provided,
 * `ship.branchPrefix` and `ship.promptFilePath` must be non-blank.
 *
 * @param {object} opts - Production-loop options.
 * @returns {void}
 * @throws {ValidationError} On the first failing check.
 */
function validate2(opts) {
  if (!opts.runId.trim()) throw new ValidationError("runProductionLoop: runId required");
  if (!opts.target.trim()) throw new ValidationError("runProductionLoop: target required");
  if (opts.evolve.holdoutScenarios.length === 0) {
    throw new ValidationError("runProductionLoop: evolve.holdoutScenarios must not be empty");
  }
  if (opts.evolve.searchScenarios && opts.evolve.searchScenarios.length === 0) {
    throw new ValidationError(
      "runProductionLoop: evolve.searchScenarios must be omitted or non-empty"
    );
  }
  // An empty `if` on evolve.gate.baselineKey / evolve.baselineId used to sit
  // here. It enforced nothing, and its only observable effect was a TypeError
  // when `evolve.gate` was omitted, so it has been removed.
  if (opts.ship) {
    if (!opts.ship.branchPrefix.trim()) {
      throw new ValidationError("runProductionLoop: ship.branchPrefix required");
    }
    if (!opts.ship.promptFilePath.trim()) {
      throw new ValidationError("runProductionLoop: ship.promptFilePath required");
    }
  }
}
|
|
2095
|
+
/**
 * Removes duplicate ids, keeping the first occurrence of each in its original order.
 *
 * @param {Iterable<*>} ids - Ids to deduplicate.
 * @returns {Array<*>} A new array of distinct ids.
 */
function uniqueIds(ids) {
  // A Set iterates in insertion order, so first occurrences keep their position.
  return [...new Set(ids)];
}
|
|
2105
|
+
/**
 * Derives search scenarios from the holdout set when none were supplied.
 *
 * With fewer than four holdout scenarios, only the first one is used;
 * otherwise every fourth scenario (indexes 0, 4, 8, ...) is used. Each derived
 * scenario is a shallow copy whose id gets a "__search" suffix.
 *
 * @param {Array<{ id: string }>} holdout - Holdout scenarios.
 * @returns {Array<object>} The derived search scenarios.
 */
function deriveSearchScenarios(holdout) {
  const asSearch = (scenario) => ({ ...scenario, id: `${scenario.id}__search` });
  if (holdout.length >= 4) {
    const picked = holdout.filter((_, index) => index % 4 === 0);
    return picked.map((scenario) => asSearch(scenario));
  }
  return [asSearch(holdout[0])];
}
|
|
2116
|
+
/**
 * Builds a placeholder run record for a single trial.
 *
 * The hash and commit fields are filled with zeros and the model is the fixed
 * string "production-loop@synthetic". The trial score is stored under
 * `holdoutScore` when `split` is "holdout" and under `searchScore` otherwise.
 *
 * @param {object} input - `runId`, `target`, `variant`, `seed`, `trial`, `split` and `scenarioId`.
 * @returns {object} The run record.
 */
function syntheticRunRecord(input) {
  const { variant, trial, split } = input;
  const zeros = (length) => "0".repeat(length);

  const outcome = {};
  outcome[split === "holdout" ? "holdoutScore" : "searchScore"] = trial.score;
  outcome.raw = { score: trial.score, ok: trial.ok ? 1 : 0 };

  return {
    runId: input.runId,
    experimentId: input.target,
    candidateId: variant.id,
    seed: input.seed,
    model: "production-loop@synthetic",
    promptHash: zeros(64),
    configHash: zeros(64),
    commitSha: zeros(40),
    wallMs: trial.durationMs ?? 1,
    costUsd: trial.cost ?? 0,
    tokenUsage: { input: 0, output: 0 },
    outcome,
    splitTag: split,
    scenarioId: input.scenarioId
  };
}
|
|
2138
|
+
/**
 * Converts a prompt payload into text.
 *
 * `null` and `undefined` become the empty string and strings are returned
 * unchanged. Anything else is pretty-printed as JSON with two-space
 * indentation; if serialisation throws, `String(payload)` is used instead.
 *
 * @param {*} payload - Prompt payload of any type.
 * @returns {string} Text form of the payload (the raw `JSON.stringify` result for non-string payloads).
 */
function toPromptString(payload) {
  if (payload === null || payload === undefined) return "";
  if (typeof payload === "string") return payload;
  let rendered;
  try {
    rendered = JSON.stringify(payload, null, 2);
  } catch {
    // Reached for values JSON cannot serialise, such as circular structures or BigInt.
    rendered = String(payload);
  }
  return rendered;
}
|
|
2147
|
+
/**
 * Renders the default Markdown body for a production-loop pull request.
 *
 * Sections, in order: a header with run id, decision and observation counts;
 * the triggering failure cluster (if any); the held-out promotion gate
 * evidence (if any); the release-confidence summary (if any); and a fenced
 * `diff` block produced by `unifiedDiff` from the baseline and promoted prompts.
 *
 * @param {object} ctx - Render context (`target`, `runId`, `decision`, counts, `actedOnCluster`, `gate`, `release`, prompt strings).
 * @returns {string} The Markdown body, lines joined with "\n".
 */
function defaultRenderBody(ctx) {
  const { actedOnCluster: failure, gate: verdict, release: confidence } = ctx;
  const fixed4 = (value) => value.toFixed(4);

  const body = [
    `## Production-loop prompt update \u2014 \`${ctx.target}\``,
    "",
    `Run id: \`${ctx.runId}\``,
    `Decision: \`${ctx.decision}\``,
    `Observed in this cycle: ${ctx.observedRunCount} prod runs, ${ctx.observedFeedbackCount} feedback trajectories.`,
    ""
  ];

  if (failure) {
    body.push(
      "### Triggering failure cluster",
      "",
      `- **class**: \`${failure.failureClass}\``,
      `- **runs in cluster**: ${failure.runCount}`,
      `- **distinct scenarios**: ${failure.scenarioIds.length}`
    );
    if (failure.toolName) body.push(`- **tool**: \`${failure.toolName}\``);
    if (failure.dimension) body.push(`- **judge dimension**: \`${failure.dimension}\``);
    if (failure.exampleError) {
      // Keep the example on one line: cap at 200 characters and flatten newlines.
      const example = failure.exampleError.slice(0, 200).replace(/\n/g, " ");
      body.push(`- **example error**: \`${example}\``);
    }
    body.push("");
  }

  if (verdict) {
    const evidence = verdict.evidence;
    body.push(
      "### Held-out promotion gate",
      "",
      `- **decision**: \`${verdict.promote ? "PROMOTE" : "REJECT"}\``,
      `- **paired median delta**: ${fixed4(evidence.medianPairedDelta)}`,
      `- **paired 95% CI**: [${fixed4(evidence.pairedCI.low)}, ${fixed4(evidence.pairedCI.high)}]`,
      `- **paired p-value**: ${fixed4(evidence.pairedPValue)}`,
      `- **search/holdout means**: ${fixed4(evidence.searchScore)} / ${fixed4(evidence.holdoutScore)}`,
      `- **overfit gap**: ${fixed4(evidence.overfitGap)}`,
      ""
    );
  }

  if (confidence) {
    body.push(
      "### Release confidence",
      "",
      `- **status**: \`${confidence.status}\``,
      `- **pass rate**: ${fixed4(confidence.metrics.passRate)}`,
      `- **mean score**: ${fixed4(confidence.metrics.meanScore)}`
    );
    if (confidence.issues.length > 0) {
      body.push("- **issues**:");
      confidence.issues.forEach((issue) => {
        body.push(` - \`${issue.severity}\` ${issue.axis}: ${issue.detail}`);
      });
    }
    body.push("");
  }

  body.push(
    "### Prompt diff",
    "",
    "```diff",
    unifiedDiff(ctx.baselinePromptString, ctx.promotedPromptString),
    "```"
  );
  return body.join("\n");
}
|
|
2211
|
+
/**
 * Produces a minimal line diff between two texts.
 *
 * Lines only in `a` are emitted as `- <line>` and lines only in `b` as
 * `+ <line>`; unchanged lines are omitted. Lines are aligned with a
 * longest-common-subsequence match rather than by position, so inserting or
 * removing one line no longer marks every following line as changed.
 *
 * @param {string} a - Original text.
 * @param {string} b - Updated text.
 * @returns {string} Diff lines joined with "\n"; empty when the texts are equal.
 */
function unifiedDiff(a, b) {
  const aLines = a.split("\n");
  const bLines = b.split("\n");

  // Strip the common prefix and suffix first so the quadratic table below only
  // covers the region that actually differs.
  let start = 0;
  while (start < aLines.length && start < bLines.length && aLines[start] === bLines[start]) {
    start++;
  }
  let aEnd = aLines.length;
  let bEnd = bLines.length;
  while (aEnd > start && bEnd > start && aLines[aEnd - 1] === bLines[bEnd - 1]) {
    aEnd--;
    bEnd--;
  }
  const before = aLines.slice(start, aEnd);
  const after = bLines.slice(start, bEnd);

  // lcs[i][j] = length of the longest common subsequence of before[i..] and after[j..].
  const lcs = Array.from({ length: before.length + 1 }, () => new Array(after.length + 1).fill(0));
  for (let i = before.length - 1; i >= 0; i--) {
    for (let j = after.length - 1; j >= 0; j--) {
      lcs[i][j] = before[i] === after[j]
        ? lcs[i + 1][j + 1] + 1
        : Math.max(lcs[i + 1][j], lcs[i][j + 1]);
    }
  }

  // Walk the table, emitting removals before additions when both are equally good.
  const out = [];
  let i = 0;
  let j = 0;
  while (i < before.length && j < after.length) {
    if (before[i] === after[j]) {
      i++;
      j++;
    } else if (lcs[i + 1][j] >= lcs[i][j + 1]) {
      out.push(`- ${before[i]}`);
      i++;
    } else {
      out.push(`+ ${after[j]}`);
      j++;
    }
  }
  for (; i < before.length; i++) out.push(`- ${before[i]}`);
  for (; j < after.length; j++) out.push(`+ ${after[j]}`);
  return out.join("\n");
}
|
|
2225
|
+
|
|
1537
2226
|
// src/registry.ts
|
|
1538
2227
|
var ScenarioRegistry = class {
|
|
1539
2228
|
scenarios = [];
|
|
@@ -2384,36 +3073,36 @@ var FileSystemExperimentStore = class {
|
|
|
2384
3073
|
return idx.listRuns(experimentId);
|
|
2385
3074
|
}
|
|
2386
3075
|
async ensureDir() {
|
|
2387
|
-
const
|
|
2388
|
-
await
|
|
3076
|
+
const fs2 = await import("fs/promises");
|
|
3077
|
+
await fs2.mkdir(this.dir, { recursive: true });
|
|
2389
3078
|
}
|
|
2390
3079
|
async append(name, record) {
|
|
2391
3080
|
await this.ensureDir();
|
|
2392
|
-
const
|
|
3081
|
+
const fs2 = await import("fs/promises");
|
|
2393
3082
|
const path = await import("path");
|
|
2394
3083
|
const active = path.join(this.dir, `${name}.ndjson`);
|
|
2395
3084
|
try {
|
|
2396
|
-
const stat = await
|
|
3085
|
+
const stat = await fs2.stat(active);
|
|
2397
3086
|
if (stat.size >= this.maxBytes) {
|
|
2398
3087
|
const rolled = path.join(this.dir, `${name}.${Date.now()}.ndjson`);
|
|
2399
|
-
await
|
|
3088
|
+
await fs2.rename(active, rolled);
|
|
2400
3089
|
}
|
|
2401
3090
|
} catch {
|
|
2402
3091
|
}
|
|
2403
|
-
await
|
|
3092
|
+
await fs2.appendFile(active, `${JSON.stringify(record)}
|
|
2404
3093
|
`, "utf8");
|
|
2405
3094
|
}
|
|
2406
3095
|
async load() {
|
|
2407
3096
|
if (this.loaded && this.index) return this.index;
|
|
2408
|
-
const
|
|
3097
|
+
const fs2 = await import("fs/promises");
|
|
2409
3098
|
const path = await import("path");
|
|
2410
3099
|
const store = new InMemoryExperimentStore();
|
|
2411
3100
|
try {
|
|
2412
|
-
const entries = await
|
|
3101
|
+
const entries = await fs2.readdir(this.dir);
|
|
2413
3102
|
const sorted = entries.filter((f) => f.endsWith(".ndjson")).sort((a, b) => a.localeCompare(b));
|
|
2414
3103
|
for (const file of sorted) {
|
|
2415
3104
|
const full = path.join(this.dir, file);
|
|
2416
|
-
const content = await
|
|
3105
|
+
const content = await fs2.readFile(full, "utf8");
|
|
2417
3106
|
const base = file.split(".")[0];
|
|
2418
3107
|
for (const line of content.split("\n")) {
|
|
2419
3108
|
if (!line.trim()) continue;
|
|
@@ -4374,6 +5063,218 @@ function weightedKappa(a, b) {
|
|
|
4374
5063
|
if (den === 0) return 1;
|
|
4375
5064
|
return 1 - num / den;
|
|
4376
5065
|
}
|
|
5066
|
+
/**
 * Inter-rater agreement statistics for continuous scores.
 *
 * `scores` is a list of rows, one per item, each holding one score per rater.
 * A row is dropped when it has fewer than two entries, contains a non-finite
 * value, or has a different length than the first surviving row.
 *
 * @param {number[][]} scores - Item-by-rater score matrix.
 * @param {object} [opts]
 * @param {number} [opts.bootstrap=1000] - Bootstrap resamples; `<= 0` skips the intervals.
 * @param {string} [opts.weights="quadratic"] - Scheme forwarded to `continuousWeightedKappa`.
 * @param {number} [opts.seed=12648430] - Seed for the `mulberry32` generator.
 * @param {number} [opts.ciLevel=0.95] - Level forwarded to `percentileBounds`.
 * @returns {{weightedKappa:number, icc:number, pearson:number, spearman:number,
 *   ci:{icc:number[], weightedKappa:number[]}, n:number, raters:number}}
 *   All statistics are NaN when fewer than two usable rows or raters remain.
 */
function continuousAgreement(scores, opts = {}) {
  const resamples = opts.bootstrap ?? 1e3;
  const scheme = opts.weights ?? "quadratic";
  const rngSeed = opts.seed ?? 12648430;
  const level = opts.ciLevel ?? 0.95;

  const finiteRows = scores.filter((row) => row.length >= 2 && row.every((v) => Number.isFinite(v)));
  const raters = finiteRows[0]?.length ?? 0;
  const items = finiteRows.filter((row) => row.length === raters);
  const n = items.length;

  // Intervals stay [NaN, NaN] unless the bootstrap below fills them in.
  const iccInterval = [NaN, NaN];
  const kappaInterval = [NaN, NaN];

  if (n < 2 || raters < 2) {
    return {
      weightedKappa: NaN,
      icc: NaN,
      pearson: NaN,
      spearman: NaN,
      ci: { icc: iccInterval, weightedKappa: kappaInterval },
      n,
      raters
    };
  }

  const weightedKappa = continuousWeightedKappa(items, scheme);
  const icc = icc21(items);
  const pearson = avgPairwise(items, pearsonR);
  const spearman = avgPairwise(items, spearmanR);

  if (resamples > 0) {
    const rng = mulberry32(rngSeed);
    const iccDraws = [];
    const kappaDraws = [];
    for (let round = 0; round < resamples; round += 1) {
      // Resample items (rows) with replacement; each row keeps all its raters.
      const resample = Array.from({ length: n }, () => items[Math.floor(rng() * n)]);
      const iccDraw = icc21(resample);
      const kappaDraw = continuousWeightedKappa(resample, scheme);
      if (Number.isFinite(iccDraw)) iccDraws.push(iccDraw);
      if (Number.isFinite(kappaDraw)) kappaDraws.push(kappaDraw);
    }
    const [lower, upper] = percentileBounds(level);
    const targets = [
      [iccDraws, iccInterval],
      [kappaDraws, kappaInterval]
    ];
    for (const [draws, interval] of targets) {
      if (draws.length === 0) continue;
      draws.sort((x, y) => x - y);
      interval[0] = quantile(draws, lower);
      interval[1] = quantile(draws, upper);
    }
  }

  return {
    weightedKappa,
    icc,
    pearson,
    spearman,
    ci: { icc: iccInterval, weightedKappa: kappaInterval },
    n,
    raters
  };
}
|
|
5128
|
+
/**
 * Run `calibrateJudge` and extend its result with continuous agreement
 * statistics between human and judge scores.
 *
 * Items are paired by `itemId`. Only golden items that have a finite
 * candidate score contribute a `[humanScore, judgeScore]` row. When an id
 * repeats, the last occurrence supplies the value.
 *
 * @param {Array<{itemId: *, humanScore: number}>} golden - Human reference scores.
 * @param {Array<{itemId: *, score: number}>} candidate - Judge scores.
 * @param {object} [opts] - Passed through to `continuousAgreement`.
 * @returns {object} The `calibrateJudge` result plus `weightedKappaContinuous`,
 *   `icc`, `spearman` and `ci`.
 */
function calibrateJudgeContinuous(golden, candidate, opts = {}) {
  const base = calibrateJudge(golden, candidate);

  // Map keeps first-seen insertion order while letting later duplicates overwrite.
  const humanById = new Map(golden.map((g) => [g.itemId, g.humanScore]));
  const judgeById = new Map();
  for (const c of candidate) {
    if (humanById.has(c.itemId)) judgeById.set(c.itemId, c.score);
  }

  const rows = [];
  for (const [itemId, human] of humanById) {
    const judged = judgeById.get(itemId);
    if (Number.isFinite(judged)) rows.push([human, judged]);
  }

  const { weightedKappa, icc, spearman, ci } = continuousAgreement(rows, opts);
  return {
    ...base,
    weightedKappaContinuous: weightedKappa,
    icc,
    spearman,
    ci
  };
}
|
|
5149
|
+
/**
 * Weighted-kappa style agreement for continuous scores, averaged over every
 * pair of raters (columns).
 *
 * For each pair the observed mean disagreement (same item) is compared with
 * the expected mean disagreement (every item of one rater against every item
 * of the other); the pair contributes `1 - observed / expected`.
 *
 * @param {number[][]} rows - Item-by-rater matrix; width is taken from the first row.
 * @param {string} scheme - `"linear"` uses |x - y|; anything else uses (x - y)^2.
 * @returns {number} Mean over rater pairs, or NaN for empty input or fewer than two raters.
 */
function continuousWeightedKappa(rows, scheme) {
  if (rows.length === 0) return NaN;
  const raterCount = rows[0].length;
  if (raterCount < 2) return NaN;

  const penalty = scheme === "linear"
    ? (x, y) => Math.abs(x - y)
    : (x, y) => (x - y) ** 2;
  const column = (index) => rows.map((row) => row[index]);

  let kappaTotal = 0;
  let pairCount = 0;
  for (let first = 0; first < raterCount; first += 1) {
    for (let second = first + 1; second < raterCount; second += 1) {
      const left = column(first);
      const right = column(second);
      const size = left.length;

      let observed = 0;
      left.forEach((x, idx) => {
        observed += penalty(x, right[idx]);
      });
      observed /= size;

      let expected = 0;
      for (const x of left) {
        for (const y of right) expected += penalty(x, y);
      }
      expected /= size * size;

      if (expected !== 0) {
        kappaTotal += 1 - observed / expected;
      } else if (observed === 0) {
        // Both raters are constant and identical: count as perfect agreement.
        kappaTotal += 1;
      }
      pairCount += 1;
    }
  }
  return pairCount === 0 ? NaN : kappaTotal / pairCount;
}
|
|
5179
|
+
/**
 * Intraclass correlation from a two-way sum-of-squares decomposition:
 * (MSR - MSE) / (MSR + (k - 1) * MSE + k * (MSC - MSE) / n),
 * where rows are items (n) and columns are raters (k).
 *
 * @param {number[][]} rows - Item-by-rater matrix; k is taken from the first row.
 * @returns {number} The coefficient; NaN when n < 2 or k < 2. When the
 *   denominator is zero, returns 1 if MSR and MSE are both zero, else 0.
 */
function icc21(rows) {
  const subjects = rows.length;
  if (subjects < 2) return NaN;
  const raters = rows[0].length;
  if (raters < 2) return NaN;

  const rowMeans = rows.map((row) => {
    let total = 0;
    row.forEach((value) => {
      total += value;
    });
    return total / raters;
  });

  const colMeans = Array.from({ length: raters }, (_, col) => {
    let total = 0;
    for (let r = 0; r < subjects; r += 1) total += rows[r][col];
    return total / subjects;
  });

  let grandMean = 0;
  for (const mean of rowMeans) grandMean += mean;
  grandMean /= subjects;

  // Between-item, between-rater and total sums of squares.
  let ssRows = 0;
  for (const mean of rowMeans) ssRows += (mean - grandMean) ** 2;
  ssRows *= raters;

  let ssCols = 0;
  for (const mean of colMeans) ssCols += (mean - grandMean) ** 2;
  ssCols *= subjects;

  let ssTotal = 0;
  for (let r = 0; r < subjects; r += 1) {
    for (let c = 0; c < raters; c += 1) ssTotal += (rows[r][c] - grandMean) ** 2;
  }

  const ssError = ssTotal - ssRows - ssCols;
  const dfError = (subjects - 1) * (raters - 1);
  const msRows = ssRows / (subjects - 1);
  const msCols = ssCols / (raters - 1);
  const msError = dfError > 0 ? ssError / dfError : 0;

  const denom = msRows + (raters - 1) * msError + raters * (msCols - msError) / subjects;
  if (denom !== 0) return (msRows - msError) / denom;
  return msRows === 0 && msError === 0 ? 1 : 0;
}
|
|
5217
|
+
/**
 * Apply a two-column statistic to every pair of columns and average the
 * finite results.
 *
 * @param {number[][]} rows - Item-by-rater matrix; width is taken from the first row.
 * @param {(a: number[], b: number[]) => number} fn - Statistic over two columns.
 * @returns {number} Mean of the finite pair results, or NaN when there are
 *   fewer than two columns or no finite result.
 */
function avgPairwise(rows, fn) {
  const width = rows[0]?.length ?? 0;
  if (width < 2) return NaN;

  const finiteResults = [];
  for (let left = 0; left < width; left += 1) {
    for (let right = left + 1; right < width; right += 1) {
      const leftColumn = rows.map((row) => row[left]);
      const rightColumn = rows.map((row) => row[right]);
      const value = fn(leftColumn, rightColumn);
      if (Number.isFinite(value)) finiteResults.push(value);
    }
  }

  if (finiteResults.length === 0) return NaN;
  let total = 0;
  for (const value of finiteResults) total += value;
  return total / finiteResults.length;
}
|
|
5235
|
+
/**
 * Spearman rank correlation: `pearsonR` applied to the tie-averaged ranks of
 * both inputs.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} NaN when the lengths differ or fewer than two values are given.
 */
function spearmanR(a, b) {
  const mismatched = a.length !== b.length;
  const tooShort = a.length < 2;
  if (mismatched || tooShort) return NaN;

  const ranksA = rankWithTies(a);
  const ranksB = rankWithTies(b);
  return pearsonR(ranksA, ranksB);
}
|
|
5239
|
+
/**
 * Assign 1-based ascending ranks, giving tied values the average of the
 * rank positions they occupy.
 *
 * @param {number[]} xs - Values to rank.
 * @returns {number[]} Ranks aligned with the input order.
 */
function rankWithTies(xs) {
  const count = xs.length;
  // Positions of xs ordered by ascending value.
  const order = xs.map((_, position) => position).sort((p, q) => xs[p] - xs[q]);
  const ranks = new Array(count).fill(0);

  for (let start = 0; start < count; ) {
    // Extend the run while the next sorted value equals the run's first value.
    let end = start;
    while (end + 1 < count && xs[order[end + 1]] === xs[order[start]]) end += 1;

    const sharedRank = (start + end) / 2 + 1;
    for (let pos = start; pos <= end; pos += 1) ranks[order[pos]] = sharedRank;
    start = end + 1;
  }
  return ranks;
}
|
|
5254
|
+
/**
 * Create a seeded pseudo-random generator built from 32-bit integer mixing.
 *
 * @param {number} seed - Coerced to an unsigned 32-bit integer.
 * @returns {() => number} Function returning the next value in [0, 1);
 *   the same seed always yields the same sequence.
 */
function mulberry32(seed) {
  const INCREMENT = 0x6d2b79f5;
  const TWO_POW_32 = 0x100000000;
  let state = seed >>> 0;

  return function next() {
    state = (state + INCREMENT) >>> 0;
    let mixed = state;
    mixed = Math.imul(mixed ^ (mixed >>> 15), mixed | 1);
    mixed = mixed ^ (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61));
    const unsigned = (mixed ^ (mixed >>> 14)) >>> 0;
    return unsigned / TWO_POW_32;
  };
}
|
|
5264
|
+
/**
 * Convert a two-sided confidence level into lower and upper quantile positions.
 *
 * @param {number} ciLevel - Confidence level, e.g. 0.95.
 * @returns {number[]} `[tail, 1 - tail]` where `tail = (1 - ciLevel) / 2`.
 */
function percentileBounds(ciLevel) {
  const lower = (1 - ciLevel) / 2;
  const upper = 1 - lower;
  return [lower, upper];
}
|
|
5268
|
+
/**
 * Quantile of an already ascending-sorted array, using linear interpolation
 * between the two nearest positions.
 *
 * @param {number[]} sorted - Values in ascending order (not re-sorted here).
 * @param {number} q - Quantile position, scaled onto index range 0..length-1.
 * @returns {number} NaN for an empty array; the sole element for length one.
 */
function quantile(sorted, q) {
  const size = sorted.length;
  if (size === 0) return NaN;
  if (size === 1) return sorted[0];

  const position = q * (size - 1);
  const below = Math.floor(position);
  const above = Math.ceil(position);
  if (below === above) return sorted[below];

  const weight = position - below;
  return sorted[below] * (1 - weight) + sorted[above] * weight;
}
|
|
4377
5278
|
|
|
4378
5279
|
// src/observability.ts
|
|
4379
5280
|
async function toLangfuseEnvelope(store, runId) {
|
|
@@ -4875,7 +5776,7 @@ async function commitBisect(options) {
|
|
|
4875
5776
|
}
|
|
4876
5777
|
async function promptBisect(options) {
|
|
4877
5778
|
const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
|
|
4878
|
-
const
|
|
5779
|
+
const join4 = (paragraphs) => paragraphs.join("\n\n");
|
|
4879
5780
|
const goodParas = split(options.good);
|
|
4880
5781
|
const badParas = split(options.bad);
|
|
4881
5782
|
if (goodParas.length !== badParas.length) {
|
|
@@ -4895,7 +5796,7 @@ async function promptBisect(options) {
|
|
|
4895
5796
|
const result = await bisect({
|
|
4896
5797
|
good: goodMask,
|
|
4897
5798
|
bad: badMask,
|
|
4898
|
-
runEval: (mask) => options.runEval(
|
|
5799
|
+
runEval: (mask) => options.runEval(join4(paragraphsFor(mask))),
|
|
4899
5800
|
maxIterations: options.maxIterations ?? n + 5,
|
|
4900
5801
|
halfway: (g, b) => {
|
|
4901
5802
|
for (let i = 0; i < g.length; i++) {
|
|
@@ -4926,12 +5827,12 @@ async function promptBisect(options) {
|
|
|
4926
5827
|
}
|
|
4927
5828
|
}
|
|
4928
5829
|
const materializedPath = result.path.map((s) => ({
|
|
4929
|
-
state:
|
|
5830
|
+
state: join4(paragraphsFor(s.state)),
|
|
4930
5831
|
score: s.score,
|
|
4931
5832
|
pass: s.pass
|
|
4932
5833
|
}));
|
|
4933
5834
|
return {
|
|
4934
|
-
culprit:
|
|
5835
|
+
culprit: join4(paragraphsFor(culprit)),
|
|
4935
5836
|
path: materializedPath,
|
|
4936
5837
|
converged: result.converged,
|
|
4937
5838
|
inputInconsistent: result.inputInconsistent,
|
|
@@ -5176,7 +6077,7 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
5176
6077
|
runCountByScenario.set(r.scenarioId, (runCountByScenario.get(r.scenarioId) ?? 0) + 1);
|
|
5177
6078
|
}
|
|
5178
6079
|
const runCounts = [...runCountByScenario.values()];
|
|
5179
|
-
const p25 = runCounts.length > 0 ?
|
|
6080
|
+
const p25 = runCounts.length > 0 ? quantile2(runCounts, 0.25) : 0;
|
|
5180
6081
|
for (const s of scenarios) {
|
|
5181
6082
|
const count = runCountByScenario.get(s.id) ?? 0;
|
|
5182
6083
|
if (count <= p25 && count < 3) {
|
|
@@ -5230,7 +6131,7 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
5230
6131
|
}
|
|
5231
6132
|
return targets.sort((a, b) => b.priority - a.priority).slice(0, topK);
|
|
5232
6133
|
}
|
|
5233
|
-
function
|
|
6134
|
+
function quantile2(xs, p) {
|
|
5234
6135
|
const sorted = [...xs].sort((a, b) => a - b);
|
|
5235
6136
|
const idx = p * (sorted.length - 1);
|
|
5236
6137
|
const lo = Math.floor(idx);
|
|
@@ -7619,6 +8520,52 @@ function createCompositeMutator(opts) {
|
|
|
7619
8520
|
};
|
|
7620
8521
|
}
|
|
7621
8522
|
|
|
8523
|
+
// src/discover-personas.ts
|
|
8524
|
+
import { promises as fs } from "fs";
|
|
8525
|
+
import { basename, extname, join as join3 } from "path";
|
|
8526
|
+
// Default persona filename shape: two digits, a dash, a name, and a
// yaml/yml/json/md extension (e.g. "01-novice.yaml").
var DEFAULT_PATTERN = /^\d{2}-.+\.(yaml|yml|json|md)$/;

/**
 * Find persona files in a directory.
 *
 * @param {string} dir - Directory to scan. A missing directory yields `[]`.
 * @param {object} [opts]
 * @param {RegExp} [opts.pattern] - Filename filter; defaults to `DEFAULT_PATTERN`.
 * @param {string[]} [opts.exclude] - Filenames or ids (filename without extension) to skip.
 * @param {string[]} [opts.include] - When non-empty, keep only files whose
 *   filename or id contains at least one of these substrings.
 * @param {boolean} [opts.recursive] - Descend into subdirectories when truthy.
 * @returns {Promise<Array<{path: string, filename: string, id: string}>>}
 *   Matches sorted by filename, with the full path as a tie-breaker so the
 *   order does not depend on directory enumeration order.
 * @throws Re-throws any `readdir` error other than `ENOENT`.
 */
async function discoverPersonas(dir, opts = {}) {
  const pattern = opts.pattern ?? DEFAULT_PATTERN;
  const exclude = new Set(opts.exclude ?? []);
  const include = opts.include;

  const matchesPattern = (name) => {
    // A global/sticky regex keeps state in lastIndex between .test() calls,
    // which would make every other file silently fail to match.
    if (pattern.global || pattern.sticky) pattern.lastIndex = 0;
    return pattern.test(name);
  };

  async function walk(d) {
    let entries;
    try {
      entries = await fs.readdir(d, { withFileTypes: true });
    } catch (err) {
      if (err?.code === "ENOENT") return [];
      throw err;
    }
    const out = [];
    for (const entry of entries) {
      const full = join3(d, entry.name);
      if (entry.isDirectory()) {
        if (opts.recursive) {
          for (const nested of await walk(full)) out.push(nested);
        }
        continue;
      }
      if (!matchesPattern(entry.name)) continue;
      const id = basename(entry.name, extname(entry.name));
      if (exclude.has(entry.name) || exclude.has(id)) continue;
      if (include && include.length > 0) {
        const matched = include.some((needle) => entry.name.includes(needle) || id.includes(needle));
        if (!matched) continue;
      }
      out.push({ path: full, filename: entry.name, id });
    }
    return out;
  }

  const results = await walk(dir);
  results.sort((a, b) => {
    const byName = a.filename.localeCompare(b.filename);
    if (byName !== 0) return byName;
    // Same filename in different directories: fall back to a plain
    // code-unit comparison of the path for a stable, locale-independent order.
    if (a.path < b.path) return -1;
    return a.path > b.path ? 1 : 0;
  });
  return results;
}
|
|
8568
|
+
|
|
7622
8569
|
// src/evolution-telemetry.ts
|
|
7623
8570
|
import { appendFileSync as appendFileSync3, existsSync as existsSync5, mkdirSync as mkdirSync3, readFileSync as readFileSync4, writeFileSync } from "fs";
|
|
7624
8571
|
import { dirname as dirname3 } from "path";
|
|
@@ -8008,6 +8955,90 @@ var JsonlTrialCache = class {
|
|
|
8008
8955
|
}
|
|
8009
8956
|
};
|
|
8010
8957
|
|
|
8958
|
+
// src/judge-retry.ts
|
|
8959
|
+
// Defaults applied by withJudgeRetry when the policy omits a field.
var DEFAULT_MAX_ATTEMPTS = 3;
var DEFAULT_TIMEOUT_MS = 90000;

// Exponential backoff: 500ms doubled per attempt index, capped at 16s.
var DEFAULT_BACKOFF = (attempt) => {
  const uncapped = 500 * Math.pow(2, attempt);
  return Math.min(uncapped, 16000);
};

// Error name/message fragments that defaultIsRetryable treats as retryable.
var ABORT_PATTERNS = [
  // Cancellation and timeouts.
  /AbortError/i,
  /TimeoutError/i,
  // Network-level failures.
  /fetch failed/i,
  /ECONNRESET/i,
  /ETIMEDOUT/i,
  /EAI_AGAIN/i,
  // Aborted or truncated transfers.
  /this operation was aborted/i,
  /stream.*ended.*unexpectedly/i,
  /socket hang up/i
];

// HTTP status codes that defaultIsRetryable treats as retryable.
var RETRYABLE_HTTP_STATUS = /* @__PURE__ */ new Set([
  429,
  502,
  503,
  504
]);
|
|
8974
|
+
/**
 * Default retry predicate for judge calls.
 *
 * @param {unknown} err - The caught value.
 * @returns {boolean} True only for `Error` instances whose name or message
 *   matches one of `ABORT_PATTERNS`, or whose numeric `status` property is in
 *   `RETRYABLE_HTTP_STATUS`. Non-Error values are never retryable.
 */
function defaultIsRetryable(err) {
  if (!(err instanceof Error)) return false;

  for (const pattern of ABORT_PATTERNS) {
    if (pattern.test(err.message) || pattern.test(err.name)) return true;
  }

  const { status } = err;
  return typeof status === "number" && RETRYABLE_HTTP_STATUS.has(status);
}
|
|
8982
|
+
/**
 * Promise-based delay.
 *
 * @param {number} ms - Delay passed to `setTimeout`.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
8985
|
+
/**
 * Call a judge function with per-attempt timeout signalling, retries with
 * backoff, and fallback across a list of models.
 *
 * Each model is tried up to `maxAttempts` times before moving to the next.
 * Every attempt gets a fresh `AbortSignal` that is aborted with
 * `new Error("TimeoutError")` after `timeoutMs`; `judgeFn` is responsible
 * for honoring that signal. A non-retryable error ends the whole run at once.
 * This function does not throw on judge failure; it returns a failure result.
 *
 * @param {(model: string|undefined, signal: AbortSignal) => Promise<*>} judgeFn
 * @param {object} [policy]
 * @param {number} [policy.maxAttempts] - Attempts per model (default `DEFAULT_MAX_ATTEMPTS`).
 * @param {number} [policy.timeoutMs] - Per-attempt abort delay (default `DEFAULT_TIMEOUT_MS`).
 * @param {(attempt: number) => number} [policy.backoffMs] - Delay before the
 *   next attempt on the same model, given the zero-based attempt index.
 * @param {(err: Error) => boolean} [policy.isRetryable] - Default `defaultIsRetryable`.
 * @param {string[]} [policy.models] - Models to try in order; when empty or
 *   absent, `judgeFn` is called with `undefined` as the model.
 * @returns {Promise<object>} On success `{ value, succeeded: true, attempts,
 *   modelUsed, attemptErrors }`; on failure `{ value: null, succeeded: false,
 *   attempts, error, attemptErrors }`.
 */
async function withJudgeRetry(judgeFn, policy = {}) {
  const maxAttempts = policy.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
  const timeoutMs = policy.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const backoff = policy.backoffMs ?? DEFAULT_BACKOFF;
  const isRetryable = policy.isRetryable ?? defaultIsRetryable;
  const models = policy.models && policy.models.length > 0 ? policy.models : [void 0];

  const attemptErrors = [];
  let totalAttempts = 0;
  let lastError;

  const failure = (error) => ({
    value: null,
    succeeded: false,
    attempts: totalAttempts,
    error,
    attemptErrors
  });

  for (const model of models) {
    let attempt = 0;
    while (attempt < maxAttempts) {
      totalAttempts += 1;
      const controller = new AbortController();
      const timer = setTimeout(() => controller.abort(new Error("TimeoutError")), timeoutMs);

      // Capture the outcome first so the timer is cleared before any backoff sleep.
      let outcome;
      try {
        outcome = { failed: false, value: await judgeFn(model, controller.signal) };
      } catch (err) {
        outcome = { failed: true, thrown: err };
      } finally {
        clearTimeout(timer);
      }

      if (!outcome.failed) {
        return {
          value: outcome.value,
          succeeded: true,
          attempts: totalAttempts,
          modelUsed: model,
          attemptErrors
        };
      }

      const thrown = outcome.thrown;
      const errObj = thrown instanceof Error ? thrown : new Error(String(thrown));
      lastError = errObj;
      attemptErrors.push({
        attempt: totalAttempts,
        model: model ?? "(default)",
        error: errObj.message
      });

      if (!isRetryable(errObj)) return failure(errObj);

      // No sleep after a model's final attempt; move straight to the next model.
      if (attempt < maxAttempts - 1) {
        await sleep(backoff(attempt));
      }
      attempt += 1;
    }
  }

  return failure(lastError);
}
|
|
9041
|
+
|
|
8011
9042
|
// src/orthogonality.ts
|
|
8012
9043
|
function passOrthogonality(input) {
|
|
8013
9044
|
const passes = input.passes;
|
|
@@ -8225,6 +9256,55 @@ function createSandboxPool(opts) {
|
|
|
8225
9256
|
utilization
|
|
8226
9257
|
};
|
|
8227
9258
|
}
|
|
9259
|
+
|
|
9260
|
+
// src/trial-aggregator.ts
|
|
9261
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} xs
 * @returns {number} The mean, or 0 for an empty list.
 */
function meanOf(xs) {
  const count = xs.length;
  if (count === 0) return 0;

  let total = 0;
  xs.forEach((x) => {
    total += x;
  });
  return total / count;
}
|
|
9265
|
+
/**
 * Average each metric key across a list of metric records.
 *
 * For every key seen in any row, the values whose `typeof` is "number" are
 * averaged with `meanOf`; keys with no numeric value in any row are omitted.
 *
 * @param {object[]} rows - Metric records keyed by metric name.
 * @returns {object} Per-key means; `{}` for an empty list.
 */
function meanMetrics(rows) {
  if (rows.length === 0) return {};

  // Union of keys in first-seen order.
  const names = new Set(rows.flatMap((row) => Object.keys(row)));

  const averages = {};
  for (const name of names) {
    const numeric = [];
    for (const row of rows) {
      const value = row[name];
      if (typeof value === "number") numeric.push(value);
    }
    if (numeric.length > 0) averages[name] = meanOf(numeric);
  }
  return averages;
}
|
|
9276
|
+
/**
 * Aggregate trial results according to how judge failures should be treated.
 *
 * Trials with a truthy `error` are never graded. Among the rest, a trial
 * counts as judge-failed only when `judgeSucceeded === false`.
 *
 * - `"strict-fail"`: if any judge failure exists, return an all-zero summary
 *   carrying a `strictFailure` record.
 * - `"exclude-failed"`: average over judge-successful trials only.
 * - any other mode: average over every graded trial.
 *
 * `okRate` is always computed over all graded trials, and
 * `excludedFailedTrials` always reports the judge-failed count.
 *
 * @param {object[]} trials - Records read for `error`, `judgeSucceeded`,
 *   `judgeError`, `score`, `cost`, `durationMs`, `ok` and `metrics`.
 * @param {{mode: string}} opts
 * @returns {object} Summary with `meanScore`, `meanCost`, `meanDurationMs`,
 *   `okRate`, `countedTrials`, `excludedFailedTrials`, `totalTrials`,
 *   `metrics`, and `strictFailure` on the strict-fail path.
 */
function aggregateTrialsByMode(trials, opts) {
  const graded = [];
  const judged = [];
  const judgeFailures = [];
  for (const trial of trials) {
    if (trial.error) continue;
    graded.push(trial);
    if (trial.judgeSucceeded === false) {
      judgeFailures.push(trial);
    } else {
      judged.push(trial);
    }
  }

  const failedCount = judgeFailures.length;
  const totalTrials = trials.length;

  if (opts.mode === "strict-fail" && failedCount > 0) {
    const firstWithError = judgeFailures.find((trial) => trial.judgeError);
    return {
      meanScore: 0,
      meanCost: 0,
      meanDurationMs: 0,
      okRate: 0,
      countedTrials: 0,
      excludedFailedTrials: failedCount,
      totalTrials,
      metrics: {},
      strictFailure: {
        failedCount,
        firstError: firstWithError?.judgeError
      }
    };
  }

  const counted = opts.mode === "exclude-failed" ? judged : graded;

  let okRate = 0;
  if (graded.length !== 0) {
    const okCount = graded.filter((trial) => trial.ok).length;
    okRate = okCount / graded.length;
  }

  return {
    meanScore: meanOf(counted.map((trial) => trial.score)),
    meanCost: meanOf(counted.map((trial) => trial.cost ?? 0)),
    meanDurationMs: meanOf(counted.map((trial) => trial.durationMs ?? 0)),
    okRate,
    countedTrials: counted.length,
    excludedFailedTrials: failedCount,
    totalTrials,
    metrics: meanMetrics(counted.map((trial) => trial.metrics ?? {}))
  };
}
|
|
8228
9308
|
export {
|
|
8229
9309
|
AgentDriver,
|
|
8230
9310
|
AgentEvalError,
|
|
@@ -8314,6 +9394,7 @@ export {
|
|
|
8314
9394
|
adversarialJudge,
|
|
8315
9395
|
aggregateLlm,
|
|
8316
9396
|
aggregateRunScore,
|
|
9397
|
+
aggregateTrialsByMode,
|
|
8317
9398
|
allCriticalPassed,
|
|
8318
9399
|
analyzeAntiSlop,
|
|
8319
9400
|
analyzeSeries,
|
|
@@ -8336,6 +9417,7 @@ export {
|
|
|
8336
9417
|
buildTrajectory,
|
|
8337
9418
|
byteLengthRange,
|
|
8338
9419
|
calibrateJudge,
|
|
9420
|
+
calibrateJudgeContinuous,
|
|
8339
9421
|
callLlm,
|
|
8340
9422
|
callLlmJson,
|
|
8341
9423
|
canaryLeakView,
|
|
@@ -8360,6 +9442,7 @@ export {
|
|
|
8360
9442
|
computeToolUseMetrics,
|
|
8361
9443
|
confidenceInterval,
|
|
8362
9444
|
containsAll,
|
|
9445
|
+
continuousAgreement,
|
|
8363
9446
|
controlFailureClassFromVerification,
|
|
8364
9447
|
controlRunToFeedbackTrajectory,
|
|
8365
9448
|
controlRunToRunRecord,
|
|
@@ -8384,6 +9467,7 @@ export {
|
|
|
8384
9467
|
defaultProviderRedactor,
|
|
8385
9468
|
defaultReferenceReplayMatcher,
|
|
8386
9469
|
deployGateLayer,
|
|
9470
|
+
discoverPersonas,
|
|
8387
9471
|
distillPlaybook,
|
|
8388
9472
|
dominates,
|
|
8389
9473
|
estimateCost,
|
|
@@ -8417,6 +9501,7 @@ export {
|
|
|
8417
9501
|
formatDriverReport,
|
|
8418
9502
|
formatFindings,
|
|
8419
9503
|
gainHistogram,
|
|
9504
|
+
ghCliClient,
|
|
8420
9505
|
precision as goldenPrecision,
|
|
8421
9506
|
gradeSemanticStatus,
|
|
8422
9507
|
groupBy,
|
|
@@ -8424,6 +9509,7 @@ export {
|
|
|
8424
9509
|
hashJson,
|
|
8425
9510
|
hashScenarios,
|
|
8426
9511
|
htmlContainsElement,
|
|
9512
|
+
httpGithubClient,
|
|
8427
9513
|
inMemoryReferenceReplayStore,
|
|
8428
9514
|
inMemoryReviewStore,
|
|
8429
9515
|
integrationAsi,
|
|
@@ -8484,6 +9570,7 @@ export {
|
|
|
8484
9570
|
printDriverSummary,
|
|
8485
9571
|
probeLlm,
|
|
8486
9572
|
promptBisect,
|
|
9573
|
+
proposeAutomatedPullRequest,
|
|
8487
9574
|
proposeSynthesisTargets,
|
|
8488
9575
|
providerFromBaseUrl,
|
|
8489
9576
|
pytestTestParser,
|
|
@@ -8528,6 +9615,7 @@ export {
|
|
|
8528
9615
|
runKeywordCoverageJudgeUrl,
|
|
8529
9616
|
runLiveProof,
|
|
8530
9617
|
runMultiShotOptimization,
|
|
9618
|
+
runProductionLoop,
|
|
8531
9619
|
runPromptEvolution,
|
|
8532
9620
|
runProposeReview,
|
|
8533
9621
|
runProposeReviewAsControlLoop,
|
|
@@ -8582,6 +9670,7 @@ export {
|
|
|
8582
9670
|
whitespaceCollapseMutator,
|
|
8583
9671
|
wilcoxonSignedRank,
|
|
8584
9672
|
withAssignedFeedbackSplit,
|
|
9673
|
+
withJudgeRetry,
|
|
8585
9674
|
wranglerDeployRunner
|
|
8586
9675
|
};
|
|
8587
9676
|
//# sourceMappingURL=index.js.map
|