@tangle-network/agent-eval 0.24.0 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -8,9 +8,10 @@ import {
8
8
  classifyFailure,
9
9
  compareToBaseline,
10
10
  computeToolUseMetrics,
11
+ failureClusterView,
11
12
  iqr,
12
13
  welchsTTest
13
- } from "./chunk-OHEPNJQN.js";
14
+ } from "./chunk-JLZQWFV3.js";
14
15
  import {
15
16
  exportTrainingData,
16
17
  toNdjson
@@ -95,7 +96,7 @@ import {
95
96
  summarizePreferenceMemory,
96
97
  trialTraceFromMultiShotTrial,
97
98
  withAssignedFeedbackSplit
98
- } from "./chunk-VRJVTXRV.js";
99
+ } from "./chunk-WHZMVFUV.js";
99
100
  import {
100
101
  RunRecordValidationError,
101
102
  isRunRecord,
@@ -220,6 +221,304 @@ import {
220
221
  } from "./chunk-NG236HPC.js";
221
222
  import "./chunk-PZ5AY32C.js";
222
223
 
224
+ // src/auto-pr.ts
225
/**
 * Validates a pull-request proposal and forwards it to the supplied client.
 *
 * @param {{ proposeChange(input: object): Promise<object> }} client - Object
 *   that performs the actual change proposal.
 * @param {object} input - Proposal description; checked by `validate` first.
 * @returns {Promise<object>} Whatever `client.proposeChange(input)` resolves to.
 * @throws Whatever `validate` throws for a malformed proposal.
 */
async function proposeAutomatedPullRequest(client, input) {
  // Validation runs synchronously before the client is touched, so a bad
  // proposal never reaches the network or the git CLI.
  validate(input);
  const outcome = client.proposeChange(input);
  return outcome;
}
229
/**
 * Checks a pull-request proposal before it is handed to a client.
 *
 * Checks run in this order and the first failure wins: repository
 * coordinates, branch name, branch differs from base, at least one file
 * change, each file path (non-blank, no "..", no leading "/", no duplicates),
 * and finally the title.
 *
 * @param {object} input - Proposal with `repo`, `branchName`, optional
 *   `baseBranch` (treated as "main" when nullish), `fileChanges` and `title`.
 * @returns {void}
 * @throws {ValidationError} When any check fails.
 */
function validate(input) {
  const reject = (message) => {
    throw new ValidationError(message);
  };

  const { owner, name } = input.repo;
  if (!(owner.trim() && name.trim())) {
    reject("proposeAutomatedPullRequest: repo.owner and repo.name required");
  }

  const branch = input.branchName;
  if (branch.trim() === "" || /\s/.test(branch)) {
    reject(
      "proposeAutomatedPullRequest: branchName must be non-empty and contain no whitespace"
    );
  }

  const base = input.baseBranch ?? "main";
  if (branch === base) {
    reject("proposeAutomatedPullRequest: branchName must differ from baseBranch");
  }

  if (input.fileChanges.length === 0) {
    reject("proposeAutomatedPullRequest: fileChanges must not be empty");
  }

  // Paths are compared verbatim, so "a" and "./a" count as different entries.
  const visited = new Set();
  for (const { path } of input.fileChanges) {
    const malformed = path.trim() === "" || path.includes("..") || path.startsWith("/");
    if (malformed) {
      reject(
        `proposeAutomatedPullRequest: invalid file path "${path}" (no '..' or leading '/')`
      );
    }
    if (visited.has(path)) {
      reject(`proposeAutomatedPullRequest: duplicate file path "${path}"`);
    }
    visited.add(path);
  }

  if (input.title.trim() === "") {
    reject("proposeAutomatedPullRequest: title must not be empty");
  }
}
260
/**
 * Creates a pull-request client that talks to the GitHub REST API directly.
 *
 * The returned `proposeChange(input)`:
 *   1. returns a compare URL without any network call when `input.dryRun` is set;
 *   2. otherwise reads the base branch ref and its commit;
 *   3. creates one blob per file change, a tree on top of the base tree, and a
 *      commit whose only parent is the base commit;
 *   4. creates `input.branchName`, or force-moves it when it already exists;
 *   5. reuses the first open pull request for that branch or opens a new one;
 *   6. requests reviewers and adds labels on a best-effort basis.
 *
 * @param {object} opts
 * @param {string} opts.token - Sent as a `Bearer` authorization credential.
 * @param {Function} [opts.fetchImpl] - fetch-compatible function (defaults to global `fetch`).
 * @param {string} [opts.apiBase] - API root, default "https://api.github.com";
 *   trailing slashes are removed.
 * @param {() => Date} [opts.now] - Clock used for the commit author date.
 * @returns {{ proposeChange(input: object): Promise<{prUrl: string, branchName: string, headSha: string, dryRun: boolean}> }}
 */
function httpGithubClient(opts) {
  const fetchImpl = opts.fetchImpl ?? fetch;
  const apiBase = (opts.apiBase ?? "https://api.github.com").replace(/\/+$/, "");
  const now = opts.now ?? (() => new Date());

  /**
   * Performs one JSON request against the API.
   * Resolves to the parsed JSON body, or to `null` when `accept404` is true
   * and the server answers 404. Any other non-2xx status throws ConfigError
   * carrying at most 400 characters of the response text.
   */
  async function api(method, path, body, accept404 = false) {
    const res = await fetchImpl(`${apiBase}${path}`, {
      method,
      headers: {
        accept: "application/vnd.github+json",
        "content-type": "application/json",
        authorization: `Bearer ${opts.token}`,
        "x-github-api-version": "2022-11-28"
      },
      body: body === void 0 ? void 0 : JSON.stringify(body)
    });
    if (accept404 && res.status === 404) return null;
    if (!res.ok) {
      const text = await res.text().catch(() => "");
      throw new ConfigError(
        `proposeAutomatedPullRequest: GitHub ${method} ${path} \u2192 ${res.status} ${text.slice(0, 400)}`
      );
    }
    return await res.json();
  }

  return {
    async proposeChange(input) {
      const baseBranch = input.baseBranch ?? "main";
      const repoPath = `/repos/${input.repo.owner}/${input.repo.name}`;
      if (input.dryRun) {
        return {
          prUrl: `https://github.com/${input.repo.owner}/${input.repo.name}/compare/${baseBranch}...${input.branchName}`,
          branchName: input.branchName,
          headSha: "dry-run",
          dryRun: true
        };
      }

      // accept404 is required here: without it `api` throws a generic HTTP
      // error on 404 and the dedicated "not found" messages below can never
      // be produced.
      const baseRef = await api("GET", `${repoPath}/git/ref/heads/${baseBranch}`, void 0, true);
      if (!baseRef) {
        throw new ConfigError(`proposeAutomatedPullRequest: base branch "${baseBranch}" not found`);
      }
      const baseSha = baseRef.object.sha;
      const baseCommit = await api("GET", `${repoPath}/git/commits/${baseSha}`, void 0, true);
      if (!baseCommit) {
        throw new ConfigError(
          `proposeAutomatedPullRequest: base commit ${baseSha} not found (race condition?)`
        );
      }

      // Blobs are created one request at a time, in fileChanges order.
      const treeEntries = [];
      for (const change of input.fileChanges) {
        const blob = await api("POST", `${repoPath}/git/blobs`, {
          content: change.contents,
          encoding: "utf-8"
        });
        if (!blob) throw new ConfigError("proposeAutomatedPullRequest: blob creation returned null");
        treeEntries.push({
          path: change.path,
          mode: "100644",
          type: "blob",
          sha: blob.sha
        });
      }
      const tree = await api("POST", `${repoPath}/git/trees`, {
        base_tree: baseCommit.tree.sha,
        tree: treeEntries
      });
      if (!tree) throw new ConfigError("proposeAutomatedPullRequest: tree creation returned null");

      // An explicit identity is only sent when both name and email are given;
      // the same identity is used for author and committer.
      const author =
        input.authorName && input.authorEmail
          ? { name: input.authorName, email: input.authorEmail, date: now().toISOString() }
          : void 0;
      const commitMessage = renderCommitMessage(input);
      const commit = await api("POST", `${repoPath}/git/commits`, {
        message: commitMessage,
        tree: tree.sha,
        parents: [baseSha],
        ...(author ? { author, committer: author } : {})
      });
      if (!commit) {
        throw new ConfigError("proposeAutomatedPullRequest: commit creation returned null");
      }

      // Create the branch, or force-move an existing one onto the new commit.
      const existing = await api(
        "GET",
        `${repoPath}/git/ref/heads/${input.branchName}`,
        void 0,
        true
      );
      if (!existing) {
        await api("POST", `${repoPath}/git/refs`, {
          ref: `refs/heads/${input.branchName}`,
          sha: commit.sha
        });
      } else if (existing.object.sha !== commit.sha) {
        await api("PATCH", `${repoPath}/git/refs/heads/${input.branchName}`, {
          sha: commit.sha,
          force: true
        });
      }

      const openPrs = await api(
        "GET",
        `${repoPath}/pulls?state=open&head=${encodeURIComponent(`${input.repo.owner}:${input.branchName}`)}`
      );
      let pr;
      if (openPrs && openPrs.length > 0) {
        // An already-open PR for this branch is reused; its title and body
        // are left untouched.
        pr = openPrs[0];
      } else {
        const created = await api("POST", `${repoPath}/pulls`, {
          title: input.title,
          body: input.body,
          head: input.branchName,
          base: baseBranch
        });
        if (!created) {
          throw new ConfigError("proposeAutomatedPullRequest: PR creation returned null");
        }
        pr = created;
      }

      // Reviewers and labels are deliberately best-effort: a failure here
      // must not fail a pull request that already exists.
      if (input.reviewers && input.reviewers.length > 0) {
        await api(
          "POST",
          `${repoPath}/pulls/${pr.number}/requested_reviewers`,
          { reviewers: input.reviewers },
          true
        ).catch(() => {});
      }
      if (input.labels && input.labels.length > 0) {
        await api(
          "POST",
          `${repoPath}/issues/${pr.number}/labels`,
          { labels: input.labels },
          true
        ).catch(() => {});
      }

      return {
        prUrl: pr.html_url,
        branchName: input.branchName,
        headSha: commit.sha,
        dryRun: false
      };
    }
  };
}
398
/**
 * Creates a pull-request client that shells out to `git` and the GitHub CLI.
 *
 * The returned `proposeChange(input)` works inside `opts.cwd`: it resets the
 * checkout to `origin/<baseBranch>`, recreates `input.branchName`, writes and
 * stages every file change, commits, force-pushes, and then reuses an open
 * pull request for the branch or creates one with `<bin> pr create`.
 *
 * NOTE: this hard-resets the working tree in `cwd`; uncommitted local work on
 * the base branch is discarded.
 *
 * @param {object} [opts]
 * @param {string} [opts.bin] - GitHub CLI executable, default "gh".
 * @param {string} [opts.cwd] - Repository checkout, default `process.cwd()`.
 * @param {Function} [opts.exec] - `(cmd, args, { cwd, stdin }) => Promise<{ stdout, stderr, exitCode }>`;
 *   defaults to `defaultExec`.
 * @returns {{ proposeChange(input: object): Promise<{prUrl: string, branchName: string, headSha: string, dryRun: boolean}> }}
 */
function ghCliClient(opts = {}) {
  const bin = opts.bin ?? "gh";
  const cwd = opts.cwd ?? process.cwd();
  const exec = opts.exec ?? defaultExec;

  /** Runs a command and throws ConfigError on a non-zero exit code. */
  async function run(cmd, args, stdin) {
    const r = await exec(cmd, args, { cwd, stdin });
    if (r.exitCode !== 0) {
      throw new ConfigError(
        `proposeAutomatedPullRequest: ${cmd} ${args.join(" ")} failed (${r.exitCode}): ${r.stderr.trim() || r.stdout.trim()}`
      );
    }
    return r;
  }

  return {
    async proposeChange(input) {
      const baseBranch = input.baseBranch ?? "main";
      if (input.dryRun) {
        return {
          prUrl: `https://github.com/${input.repo.owner}/${input.repo.name}/compare/${baseBranch}...${input.branchName}`,
          branchName: input.branchName,
          headSha: "dry-run",
          dryRun: true
        };
      }

      await run("git", ["fetch", "origin", baseBranch]);
      await run("git", ["checkout", baseBranch]);
      await run("git", ["reset", "--hard", `origin/${baseBranch}`]);
      // Best-effort: the branch usually does not exist yet, so a failure of
      // `branch -D` is expected and ignored (hence `exec`, not `run`).
      await exec("git", ["branch", "-D", input.branchName], { cwd });
      await run("git", ["checkout", "-b", input.branchName]);

      const { mkdir, writeFile } = await import("fs/promises");
      const { dirname, join, resolve } = await import("path");
      for (const change of input.fileChanges) {
        const abs = resolve(cwd, change.path);
        await mkdir(dirname(abs), { recursive: true });
        await writeFile(abs, change.contents, "utf8");
        await run("git", ["add", join(change.path)]);
      }

      // Apply the requested identity to this one commit. The original built
      // an env object from these fields but never passed it to git, so the
      // author settings were silently ignored. `-c` needs no cooperation from
      // the injected `exec`.
      const identity = [];
      if (input.authorName) identity.push("-c", `user.name=${input.authorName}`);
      if (input.authorEmail) identity.push("-c", `user.email=${input.authorEmail}`);
      const message = renderCommitMessage(input);
      await run("git", [...identity, "commit", "-m", message]);

      const headRes = await run("git", ["rev-parse", "HEAD"]);
      const headSha = headRes.stdout.trim();
      await run("git", ["push", "-f", "origin", input.branchName]);

      // A failing `pr list` is treated as "no open PR" rather than an error.
      const existing = await exec(
        bin,
        [
          "pr",
          "list",
          "--state",
          "open",
          "--head",
          input.branchName,
          "--json",
          "url,number",
          "--limit",
          "1"
        ],
        { cwd }
      );
      let prUrl = "";
      if (existing.exitCode === 0 && existing.stdout.trim()) {
        const parsed = JSON.parse(existing.stdout);
        if (parsed.length > 0 && parsed[0]) prUrl = parsed[0].url;
      }

      if (!prUrl) {
        const args = [
          "pr",
          "create",
          "--title",
          input.title,
          "--body",
          input.body,
          "--base",
          baseBranch
        ];
        if (input.reviewers && input.reviewers.length > 0) {
          args.push("--reviewer", input.reviewers.join(","));
        }
        if (input.labels && input.labels.length > 0) {
          args.push("--label", input.labels.join(","));
        }
        const r = await run(bin, args);
        // Take the first URL in the output; fall back to the raw output.
        const match = r.stdout.match(/https?:\/\/\S+/);
        prUrl = match ? match[0] : r.stdout.trim();
      }
      return { prUrl, branchName: input.branchName, headSha, dryRun: false };
    }
  };
}
491
/**
 * Spawns a child process and collects its output.
 *
 * Never rejects for process-level failures: a spawn error resolves with
 * `exitCode: 1` and the error message appended to `stderr`; a child killed
 * by a signal (null exit code) also reports `exitCode: 1`.
 *
 * @param {string} bin - Executable to run.
 * @param {string[]} args - Arguments, passed without a shell.
 * @param {{ cwd: string, stdin?: string }} opts - Working directory and
 *   optional text to feed to the child's stdin.
 * @returns {Promise<{ stdout: string, stderr: string, exitCode: number }>}
 */
async function defaultExec(bin, args, opts) {
  const { spawn } = await import("child_process");
  return new Promise((resolveExec) => {
    const child = spawn(bin, args, { cwd: opts.cwd });
    let stdout = "";
    let stderr = "";

    // Decode at the stream level so a multi-byte UTF-8 character split across
    // two chunks is not corrupted by per-chunk toString().
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });

    // A child that exits before reading its input produces EPIPE on stdin;
    // without a listener that would surface as an uncaught exception.
    child.stdin.on("error", () => {});
    // Always close stdin, even with no input: otherwise a child that reads
    // stdin waits forever for EOF and this promise never settles.
    if (opts.stdin == null) {
      child.stdin.end();
    } else {
      child.stdin.end(opts.stdin);
    }

    // Both "error" and "close" may fire; the first resolve wins and later
    // calls are no-ops.
    child.on("error", (err) => {
      resolveExec({ stdout, stderr: `${stderr}${err.message}`, exitCode: 1 });
    });
    child.on("close", (code) => {
      resolveExec({ stdout, stderr, exitCode: code ?? 1 });
    });
  });
}
512
/**
 * Builds the commit message for a proposal.
 *
 * Layout: title, blank line, one "- <path>: <rationale>" bullet per file
 * change that has a rationale, blank line, trimmed body. The assembled text
 * is trimmed, so an empty body leaves no trailing blank lines.
 *
 * @param {object} input
 * @param {string} input.title - First line of the message.
 * @param {Array<{ path: string, rationale?: string }>} input.fileChanges
 * @param {string} [input.body] - Free-form body; null/undefined is treated as
 *   empty (proposal validation does not require a body).
 * @returns {string}
 */
function renderCommitMessage(input) {
  const lines = [input.title, ""];
  for (const change of input.fileChanges) {
    if (change.rationale) lines.push(`- ${change.path}: ${change.rationale}`);
  }
  // Separate the bullet list from the body with exactly one blank line.
  if (lines[lines.length - 1] !== "") lines.push("");
  // Previously `input.body.trim()` threw a TypeError for a missing body.
  lines.push(String(input.body ?? "").trim());
  return lines.join("\n").trim();
}
521
+
223
522
  // src/executor.ts
224
523
  async function executeScenario(tc, scenario, config) {
225
524
  const startTime = Date.now();
@@ -1534,6 +1833,396 @@ function liveProofToReleaseTrace(config, trajectory, durationMs) {
1534
1833
  };
1535
1834
  }
1536
1835
 
1836
+ // src/production-loop.ts
1837
/**
 * Runs one cycle of the production loop: observe, cluster failures, evolve
 * the prompt, gate the result, and optionally open a pull request.
 *
 * Possible `decision` values on the returned record:
 *   - "no_actionable_failures": no cluster passed the size/severity thresholds;
 *   - "evolve_yielded_no_improvement": the promoted variant is the baseline;
 *   - "gate_failed": release status is "fail" or the gate refused promotion;
 *   - "proposed_change": a better prompt exists but `opts.ship` is not set;
 *   - "pr_opened": the change was handed to `proposeAutomatedPullRequest`.
 *
 * NOTE(review): when `evolve.searchScenarios` is omitted the search set is
 * derived from the holdout scenarios, but the release scenarios list only
 * explicitly supplied search scenarios — confirm this is intended.
 *
 * @param {object} opts - Loop configuration (stores, cluster thresholds,
 *   evolve settings, optional ship settings); checked by `validate2`.
 * @returns {Promise<object>} The record produced by `finalize`.
 * @throws {ValidationError} When search and holdout scenario ids overlap.
 */
async function runProductionLoop(opts) {
  validate2(opts);
  const clock = opts.now ?? (() => new Date());
  const startedAt = clock().toISOString();

  const observedRuns = await opts.traceStore.listRuns();
  const observedFeedback = await opts.feedbackStore.list();
  const clusterReport = await failureClusterView(opts.traceStore, {
    minClusterSize: opts.cluster.minClusterSize ?? 1
  });

  // Every exit path reports the same observation data; only the decision and
  // the outcome fields differ.
  const conclude = (decision, outcome) =>
    finalize({
      opts,
      decision,
      startedAt,
      now: clock,
      observedRunCount: observedRuns.length,
      observedFeedbackCount: observedFeedback.length,
      clusters: clusterReport.clusters,
      ...outcome
    });

  const sizeFloor = opts.cluster.minClusterSize ?? 5;
  const severityFloor = opts.cluster.minSeverityRatio ?? 0.05;
  const clusterCap = opts.cluster.maxClustersPerCycle ?? 1;
  const { totalRuns } = clusterReport;
  const actionable = clusterReport.clusters
    .filter(
      (c) => c.runCount >= sizeFloor && (totalRuns === 0 || c.runCount / totalRuns >= severityFloor)
    )
    .slice(0, clusterCap);

  if (actionable.length === 0) {
    return conclude("no_actionable_failures", {
      actedOnCluster: null,
      evolution: null,
      release: null,
      gate: null,
      promotedPrompt: opts.evolve.baselinePrompt,
      pullRequest: null
    });
  }

  // Only the first actionable cluster drives this cycle.
  const [actedOn] = actionable;
  const baselineId = opts.evolve.baselineId ?? "baseline";
  const baseline = {
    id: baselineId,
    label: baselineId,
    generation: 0,
    payload: opts.evolve.baselinePrompt
  };

  const holdoutIds = uniqueIds(opts.evolve.holdoutScenarios.map((s) => s.id));
  const searchPool =
    opts.evolve.searchScenarios ?? deriveSearchScenarios(opts.evolve.holdoutScenarios);
  const searchIds = uniqueIds(searchPool.map((s) => s.id));
  const holdoutLookup = new Set(holdoutIds);
  if (searchIds.some((id) => holdoutLookup.has(id))) {
    throw new ValidationError(
      "runProductionLoop: searchScenarios and holdoutScenarios must be disjoint"
    );
  }

  const reps = opts.evolve.reps ?? 3;
  const generations = opts.evolve.generations ?? 3;
  const populationSize = opts.evolve.populationSize ?? Math.max(2, opts.evolve.reps ?? 4);
  const fallbackToRunRecord = ({ variant, scenarioId, rep, split, seed, trial }) =>
    syntheticRunRecord({
      runId: `${opts.runId}-${variant.id}-${scenarioId}-${rep}-${split}`,
      variant,
      scenarioId,
      rep,
      split,
      seed,
      trial,
      target: opts.target
    });

  const evolution = await runMultiShotOptimization({
    runId: `${opts.runId}/evolve`,
    target: opts.target,
    seedVariants: [baseline],
    searchScenarioIds: searchIds,
    reps,
    generations,
    populationSize,
    scoreConcurrency: opts.evolve.scoreConcurrency ?? 1,
    runner: opts.evolve.runner,
    scorer: opts.evolve.scorer,
    mutateAdapter: opts.evolve.mutator,
    gate: {
      holdoutScenarioIds: holdoutIds,
      reps,
      gate: { ...opts.evolve.gate, baselineKey: baseline.id },
      toRunRecord: opts.evolve.toRunRecord ?? fallbackToRunRecord
    }
  });

  const gate = evolution.gate?.decision ?? null;
  const promotedVariant = evolution.promotedVariant;
  const promoted = promotedVariant.payload;
  const promotedChanged = promotedVariant.id !== baseline.id;

  const allTrials = evolution.evolution.generations.flatMap((g) => g.trials);
  const traceEvidence = releaseTraceEvidenceFromMultiShotTrials(allTrials);
  const asReleaseScenario = (split) => (s) => ({
    id: s.id,
    payload: s,
    split,
    tags: { persona: s.persona, label: s.label }
  });
  const releaseScenarios = [
    ...(opts.evolve.searchScenarios ?? []).map(asReleaseScenario("train")),
    ...opts.evolve.holdoutScenarios.map(asReleaseScenario("holdout"))
  ];
  const release = evaluateReleaseConfidence({
    target: opts.target,
    candidateId: promotedVariant.id,
    baselineId: baseline.id,
    scenarios: releaseScenarios,
    traces: traceEvidence,
    gateDecision: gate ?? void 0,
    thresholds: opts.releaseThresholds,
    runs: [...(evolution.gate?.candidateRuns ?? []), ...(evolution.gate?.baselineRuns ?? [])]
  });

  const verdict = {
    actedOnCluster: actedOn,
    evolution,
    release,
    gate,
    promotedPrompt: promoted
  };
  if (!promotedChanged) {
    return conclude("evolve_yielded_no_improvement", { ...verdict, pullRequest: null });
  }
  if (release.status === "fail" || (gate && !gate.promote)) {
    return conclude("gate_failed", { ...verdict, pullRequest: null });
  }
  if (!opts.ship) {
    return conclude("proposed_change", { ...verdict, pullRequest: null });
  }

  const baselineStr = toPromptString(baseline.payload);
  const promotedStr = toPromptString(promoted);
  const ctx = {
    runId: opts.runId,
    target: opts.target,
    decision: "pr_opened",
    clusters: clusterReport.clusters,
    actedOnCluster: actedOn,
    observedRunCount: observedRuns.length,
    observedFeedbackCount: observedFeedback.length,
    evolution,
    release,
    gate,
    baselinePromptString: baselineStr,
    promotedPromptString: promotedStr
  };
  const renderBody = opts.ship.renderBody ?? defaultRenderBody;
  // Default file content: the promoted prompt followed by one newline.
  const renderFile = opts.ship.renderPromptFile ?? ((next, _prev) => `${next}\n`);
  const currentFile = opts.ship.readCurrentPromptFile
    ? await opts.ship.readCurrentPromptFile()
    : null;

  const pr = await proposeAutomatedPullRequest(opts.ship.client, {
    repo: opts.ship.repo,
    baseBranch: opts.ship.baseBranch ?? "main",
    branchName: `${opts.ship.branchPrefix.replace(/\/+$/, "")}/${opts.runId}`,
    title: `${opts.target}: production-loop prompt update (${opts.runId})`,
    body: renderBody(ctx),
    reviewers: opts.ship.reviewers,
    labels: opts.ship.labels,
    fileChanges: [
      {
        path: opts.ship.promptFilePath,
        contents: renderFile(promotedStr, currentFile),
        rationale: `Auto-improved against cluster "${actedOn.failureClass}" (${actedOn.runCount} prod failures)`
      }
    ],
    dryRun: opts.ship.dryRun
  });
  return conclude("pr_opened", { ...verdict, pullRequest: pr });
}
2053
/**
 * Assembles the result record for one production-loop cycle.
 *
 * `finishedAt` is taken from `args.now()` at the moment of the call;
 * `baselinePrompt` always comes from `args.opts.evolve.baselinePrompt`, and
 * `cron` falls back to `null` when `args.opts.cron` is nullish.
 *
 * @param {object} args - Decision, timestamps, observation counts and the
 *   outcome fields of the cycle, plus the original `opts`.
 * @returns {object} Plain result record.
 */
function finalize(args) {
  const {
    opts,
    decision,
    startedAt,
    observedRunCount,
    observedFeedbackCount,
    clusters,
    actedOnCluster,
    evolution,
    release,
    gate,
    promotedPrompt,
    pullRequest
  } = args;
  const finishedAt = args.now().toISOString();
  return {
    runId: opts.runId,
    target: opts.target,
    decision,
    startedAt,
    finishedAt,
    observedRunCount,
    observedFeedbackCount,
    clusters,
    actedOnCluster,
    evolution,
    release,
    gate,
    baselinePrompt: opts.evolve.baselinePrompt,
    promotedPrompt,
    pullRequest,
    cron: opts.cron ?? null
  };
}
2073
/**
 * Checks the options of `runProductionLoop` before any work is done.
 *
 * Requires a non-blank `runId` and `target`, at least one holdout scenario,
 * a `searchScenarios` list that is either omitted or non-empty, and — when
 * `ship` is set — a non-blank `branchPrefix` and `promptFilePath`.
 *
 * The original also carried an empty
 * `if (!opts.evolve.gate.baselineKey && !opts.evolve.baselineId) {}`. It
 * validated nothing and only added a TypeError when `evolve.gate` was
 * omitted, so it has been removed.
 *
 * @param {object} opts - Options passed to `runProductionLoop`.
 * @returns {void}
 * @throws {ValidationError} When a requirement above is violated.
 */
function validate2(opts) {
  if (!opts.runId.trim()) throw new ValidationError("runProductionLoop: runId required");
  if (!opts.target.trim()) throw new ValidationError("runProductionLoop: target required");
  if (opts.evolve.holdoutScenarios.length === 0) {
    throw new ValidationError("runProductionLoop: evolve.holdoutScenarios must not be empty");
  }
  if (opts.evolve.searchScenarios && opts.evolve.searchScenarios.length === 0) {
    throw new ValidationError(
      "runProductionLoop: evolve.searchScenarios must be omitted or non-empty"
    );
  }
  if (opts.ship) {
    if (!opts.ship.branchPrefix.trim()) {
      throw new ValidationError("runProductionLoop: ship.branchPrefix required");
    }
    if (!opts.ship.promptFilePath.trim()) {
      throw new ValidationError("runProductionLoop: ship.promptFilePath required");
    }
  }
}
2095
/**
 * Removes repeated ids while keeping the first occurrence of each, in the
 * order they were first seen.
 *
 * @param {Iterable<*>} ids - Ids to deduplicate.
 * @returns {Array<*>} New array; the input is not modified.
 */
function uniqueIds(ids) {
  // A Set iterates in insertion order, which is first-occurrence order.
  return Array.from(new Set(ids));
}
2105
/**
 * Derives search scenarios from the holdout list when the caller supplied none.
 *
 * Each derived scenario is a shallow copy whose id gets a "__search" suffix.
 * With fewer than four holdout scenarios only the first one is copied;
 * otherwise every fourth scenario (indices 0, 4, 8, ...) is copied.
 *
 * @param {Array<{ id: string }>} holdout - Holdout scenarios; must contain at
 *   least one entry (an empty list makes the id lookup throw).
 * @returns {Array<object>} Derived search scenarios.
 */
function deriveSearchScenarios(holdout) {
  const asSearch = (scenario) => ({ ...scenario, id: `${scenario.id}__search` });
  if (holdout.length >= 4) {
    return holdout.filter((_, position) => position % 4 === 0).map(asSearch);
  }
  return [asSearch(holdout[0])];
}
2116
/**
 * Builds a placeholder run record for a single optimization trial.
 *
 * Used by the production loop when the caller gives no `toRunRecord`.
 * Hashes and commit sha are all-zero strings, the model is the fixed marker
 * "production-loop@synthetic", and token usage is zero. The trial score is
 * stored under `holdoutScore` for the "holdout" split and under
 * `searchScore` for any other split.
 *
 * @param {object} input - `runId`, `target`, `variant`, `seed`, `trial`,
 *   `split` and `scenarioId` of the trial.
 * @returns {object} Run record.
 */
function syntheticRunRecord(input) {
  const { trial, split } = input;
  const zeros = (width) => "0".repeat(width);
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";
  const outcome = {
    [scoreField]: trial.score,
    raw: { score: trial.score, ok: trial.ok ? 1 : 0 }
  };
  return {
    runId: input.runId,
    experimentId: input.target,
    candidateId: input.variant.id,
    seed: input.seed,
    model: "production-loop@synthetic",
    promptHash: zeros(64),
    configHash: zeros(64),
    commitSha: zeros(40),
    // Missing duration defaults to 1 and missing cost to 0.
    wallMs: trial.durationMs ?? 1,
    costUsd: trial.cost ?? 0,
    tokenUsage: { input: 0, output: 0 },
    outcome,
    splitTag: split,
    scenarioId: input.scenarioId
  };
}
2138
/**
 * Converts a prompt payload to text and always returns a string.
 *
 * Strings pass through unchanged, null/undefined become "", and other values
 * are pretty-printed as JSON with two-space indentation. When JSON encoding
 * throws (cycles, BigInt) or yields no text, the result is `String(payload)`.
 *
 * @param {*} payload - Prompt payload of any type.
 * @returns {string}
 */
function toPromptString(payload) {
  if (typeof payload === "string") return payload;
  if (payload == null) return "";
  try {
    const json = JSON.stringify(payload, null, 2);
    // JSON.stringify returns undefined (without throwing) for functions and
    // symbols; returning that would break callers that split the result.
    if (typeof json === "string") return json;
  } catch {
    // Not JSON-encodable; fall through to the generic conversion below.
  }
  return String(payload);
}
2147
/**
 * Renders the default Markdown pull-request body for a production-loop cycle.
 *
 * Sections, in order: header with run id / decision / observation counts,
 * then — each only when the matching context value is truthy — the
 * triggering failure cluster, the held-out promotion gate evidence and the
 * release confidence summary, and finally a fenced diff of the baseline
 * prompt against the promoted prompt.
 *
 * @param {object} ctx - Render context built by the production loop.
 * @returns {string} Markdown text joined with "\n".
 */
function defaultRenderBody(ctx) {
  const { actedOnCluster: cluster, gate, release } = ctx;
  const fixed4 = (value) => value.toFixed(4);

  const out = [
    `## Production-loop prompt update \u2014 \`${ctx.target}\``,
    "",
    `Run id: \`${ctx.runId}\``,
    `Decision: \`${ctx.decision}\``,
    `Observed in this cycle: ${ctx.observedRunCount} prod runs, ${ctx.observedFeedbackCount} feedback trajectories.`,
    ""
  ];

  if (cluster) {
    out.push(
      "### Triggering failure cluster",
      "",
      `- **class**: \`${cluster.failureClass}\``,
      `- **runs in cluster**: ${cluster.runCount}`,
      `- **distinct scenarios**: ${cluster.scenarioIds.length}`
    );
    if (cluster.toolName) out.push(`- **tool**: \`${cluster.toolName}\``);
    if (cluster.dimension) out.push(`- **judge dimension**: \`${cluster.dimension}\``);
    if (cluster.exampleError) {
      // Keep the example on one line and cap it at 200 characters.
      const sample = cluster.exampleError.slice(0, 200).replace(/\n/g, " ");
      out.push(`- **example error**: \`${sample}\``);
    }
    out.push("");
  }

  if (gate) {
    const evidence = gate.evidence;
    out.push("### Held-out promotion gate", "");
    out.push(`- **decision**: \`${gate.promote ? "PROMOTE" : "REJECT"}\``);
    out.push(`- **paired median delta**: ${fixed4(evidence.medianPairedDelta)}`);
    out.push(
      `- **paired 95% CI**: [${fixed4(evidence.pairedCI.low)}, ${fixed4(evidence.pairedCI.high)}]`
    );
    out.push(`- **paired p-value**: ${fixed4(evidence.pairedPValue)}`);
    out.push(
      `- **search/holdout means**: ${fixed4(evidence.searchScore)} / ${fixed4(evidence.holdoutScore)}`
    );
    out.push(`- **overfit gap**: ${fixed4(evidence.overfitGap)}`);
    out.push("");
  }

  if (release) {
    out.push("### Release confidence", "");
    out.push(`- **status**: \`${release.status}\``);
    out.push(`- **pass rate**: ${fixed4(release.metrics.passRate)}`);
    out.push(`- **mean score**: ${fixed4(release.metrics.meanScore)}`);
    if (release.issues.length > 0) {
      out.push("- **issues**:");
      release.issues.forEach((issue) => {
        out.push(` - \`${issue.severity}\` ${issue.axis}: ${issue.detail}`);
      });
    }
    out.push("");
  }

  out.push(
    "### Prompt diff",
    "",
    "```diff",
    unifiedDiff(ctx.baselinePromptString, ctx.promotedPromptString),
    "```"
  );
  return out.join("\n");
}
2211
/**
 * Produces a simple line-by-line comparison of two texts.
 *
 * Lines are compared by position only: for every index where the two texts
 * differ, the old line (if any) is emitted as "- <line>" followed by the new
 * line (if any) as "+ <line>". Identical lines are omitted.
 *
 * NOTE(review): this is not an alignment-based diff — inserting one line
 * near the top makes every following line appear changed.
 *
 * @param {string} a - Original text.
 * @param {string} b - Updated text.
 * @returns {string} Changed lines joined with "\n"; "" when nothing differs.
 */
function unifiedDiff(a, b) {
  const before = a.split("\n");
  const after = b.split("\n");
  const rowCount = Math.max(before.length, after.length);
  return Array.from({ length: rowCount }, (_, row) => [before[row], after[row]])
    .filter(([oldLine, newLine]) => oldLine !== newLine)
    .flatMap(([oldLine, newLine]) => {
      const changed = [];
      if (oldLine !== void 0) changed.push(`- ${oldLine}`);
      if (newLine !== void 0) changed.push(`+ ${newLine}`);
      return changed;
    })
    .join("\n");
}
2225
+
1537
2226
  // src/registry.ts
1538
2227
  var ScenarioRegistry = class {
1539
2228
  scenarios = [];
@@ -2384,36 +3073,36 @@ var FileSystemExperimentStore = class {
2384
3073
  return idx.listRuns(experimentId);
2385
3074
  }
2386
3075
  async ensureDir() {
2387
- const fs = await import("fs/promises");
2388
- await fs.mkdir(this.dir, { recursive: true });
3076
+ const fs2 = await import("fs/promises");
3077
+ await fs2.mkdir(this.dir, { recursive: true });
2389
3078
  }
2390
3079
  async append(name, record) {
2391
3080
  await this.ensureDir();
2392
- const fs = await import("fs/promises");
3081
+ const fs2 = await import("fs/promises");
2393
3082
  const path = await import("path");
2394
3083
  const active = path.join(this.dir, `${name}.ndjson`);
2395
3084
  try {
2396
- const stat = await fs.stat(active);
3085
+ const stat = await fs2.stat(active);
2397
3086
  if (stat.size >= this.maxBytes) {
2398
3087
  const rolled = path.join(this.dir, `${name}.${Date.now()}.ndjson`);
2399
- await fs.rename(active, rolled);
3088
+ await fs2.rename(active, rolled);
2400
3089
  }
2401
3090
  } catch {
2402
3091
  }
2403
- await fs.appendFile(active, `${JSON.stringify(record)}
3092
+ await fs2.appendFile(active, `${JSON.stringify(record)}
2404
3093
  `, "utf8");
2405
3094
  }
2406
3095
  async load() {
2407
3096
  if (this.loaded && this.index) return this.index;
2408
- const fs = await import("fs/promises");
3097
+ const fs2 = await import("fs/promises");
2409
3098
  const path = await import("path");
2410
3099
  const store = new InMemoryExperimentStore();
2411
3100
  try {
2412
- const entries = await fs.readdir(this.dir);
3101
+ const entries = await fs2.readdir(this.dir);
2413
3102
  const sorted = entries.filter((f) => f.endsWith(".ndjson")).sort((a, b) => a.localeCompare(b));
2414
3103
  for (const file of sorted) {
2415
3104
  const full = path.join(this.dir, file);
2416
- const content = await fs.readFile(full, "utf8");
3105
+ const content = await fs2.readFile(full, "utf8");
2417
3106
  const base = file.split(".")[0];
2418
3107
  for (const line of content.split("\n")) {
2419
3108
  if (!line.trim()) continue;
@@ -4374,6 +5063,218 @@ function weightedKappa(a, b) {
4374
5063
  if (den === 0) return 1;
4375
5064
  return 1 - num / den;
4376
5065
  }
5066
/**
 * Agreement statistics for continuous scores given by two or more raters.
 *
 * Rows that have fewer than two values, contain a non-finite value, or whose
 * length differs from the first usable row are discarded. When fewer than two
 * rows (or raters) remain, every statistic is NaN.
 *
 * @param {number[][]} scores - One row per item, one column per rater.
 * @param {object} [opts]
 * @param {number} [opts.bootstrap=1000] - Bootstrap resamples; 0 or less skips the CIs.
 * @param {"quadratic"|"linear"} [opts.weights="quadratic"] - Disagreement weighting.
 * @param {number} [opts.seed=12648430] - Seed for the resampling PRNG.
 * @param {number} [opts.ciLevel=0.95] - Confidence level of the percentile interval.
 * @returns {{weightedKappa: number, icc: number, pearson: number, spearman: number,
 *   ci: {icc: number[], weightedKappa: number[]}, n: number, raters: number}}
 */
function continuousAgreement(scores, opts = {}) {
  const resamples = opts.bootstrap ?? 1e3;
  const scheme = opts.weights ?? "quadratic";
  const rngSeed = opts.seed ?? 12648430;
  const level = opts.ciLevel ?? 0.95;

  const usable = scores.filter((row) => row.length >= 2 && row.every((v) => Number.isFinite(v)));
  const raters = usable[0]?.length ?? 0;
  const clean = usable.filter((row) => row.length === raters);
  const n = clean.length;

  const ciIcc = [NaN, NaN];
  const ciKappa = [NaN, NaN];

  if (n < 2 || raters < 2) {
    return {
      weightedKappa: NaN,
      icc: NaN,
      pearson: NaN,
      spearman: NaN,
      ci: { icc: ciIcc, weightedKappa: ciKappa },
      n,
      raters
    };
  }

  const pointEstimates = {
    weightedKappa: continuousWeightedKappa(clean, scheme),
    icc: icc21(clean),
    pearson: avgPairwise(clean, pearsonR),
    spearman: avgPairwise(clean, spearmanR)
  };

  if (resamples > 0) {
    const rng = mulberry32(rngSeed);
    const iccDraws = [];
    const kappaDraws = [];
    for (let b = 0; b < resamples; b++) {
      // Resample items (rows) with replacement; one PRNG draw per slot.
      const resampled = Array.from({ length: n }, () => clean[Math.floor(rng() * n)]);
      const iccDraw = icc21(resampled);
      const kappaDraw = continuousWeightedKappa(resampled, scheme);
      if (Number.isFinite(iccDraw)) iccDraws.push(iccDraw);
      if (Number.isFinite(kappaDraw)) kappaDraws.push(kappaDraw);
    }
    const [lowerQ, upperQ] = percentileBounds(level);
    // Percentile interval: sort the finite draws and read off both tails.
    const fillInterval = (draws, interval) => {
      if (draws.length === 0) return;
      draws.sort((x, y) => x - y);
      interval[0] = quantile(draws, lowerQ);
      interval[1] = quantile(draws, upperQ);
    };
    fillInterval(iccDraws, ciIcc);
    fillInterval(kappaDraws, ciKappa);
  }

  return {
    ...pointEstimates,
    ci: { icc: ciIcc, weightedKappa: ciKappa },
    n,
    raters
  };
}
5128
/**
 * Extend the result of `calibrateJudge` with continuous agreement statistics
 * computed over the items present in both `golden` and `candidate`.
 *
 * Items are paired by `itemId`; golden items without a finite candidate score
 * are left out of the agreement computation.
 *
 * @param {Array<{itemId: *, humanScore: number}>} golden - Human-scored reference items.
 * @param {Array<{itemId: *, score: number}>} candidate - Judge scores to calibrate.
 * @param {object} [opts] - Forwarded unchanged to `continuousAgreement`.
 * @returns {object} The `calibrateJudge` result plus `weightedKappaContinuous`,
 *   `icc`, `spearman` and `ci`.
 */
function calibrateJudgeContinuous(golden, candidate, opts = {}) {
  const base = calibrateJudge(golden, candidate);

  // itemId -> { h: human score, j: judge score (NaN until a candidate matches) }
  const pairedById = new Map();
  golden.forEach((g) => {
    pairedById.set(g.itemId, { h: g.humanScore, j: NaN });
  });
  candidate.forEach((c) => {
    const slot = pairedById.get(c.itemId);
    if (slot) slot.j = c.score;
  });

  const rows = [...pairedById.values()]
    .filter((pair) => Number.isFinite(pair.j))
    .map((pair) => [pair.h, pair.j]);

  const { weightedKappa, icc, spearman, ci } = continuousAgreement(rows, opts);
  return {
    ...base,
    weightedKappaContinuous: weightedKappa,
    icc,
    spearman,
    ci
  };
}
5149
/**
 * Weighted-kappa-style agreement for continuous scores, averaged over every
 * pair of raters.
 *
 * For each pair the value is `1 - observed / expected`, where "observed" is
 * the mean disagreement between the raters on the same item and "expected"
 * is the mean disagreement across all item combinations. Disagreement is
 * `|x - y|` for the "linear" scheme and `(x - y) ** 2` for anything else.
 *
 * @param {number[][]} rows - One row per item, one column per rater.
 * @param {string} scheme - "linear" or any other value for quadratic.
 * @returns {number} Mean pairwise kappa, or NaN for empty input or fewer than two raters.
 */
function continuousWeightedKappa(rows, scheme) {
  if (rows.length === 0) return NaN;
  const raterCount = rows[0].length;
  if (raterCount < 2) return NaN;

  const disagreement =
    scheme === "linear" ? (x, y) => Math.abs(x - y) : (x, y) => (x - y) ** 2;
  const columns = Array.from({ length: raterCount }, (_, c) => rows.map((row) => row[c]));
  const itemCount = rows.length;

  let kappaTotal = 0;
  let pairCount = 0;
  for (let first = 0; first < raterCount; first++) {
    const a = columns[first];
    for (let second = first + 1; second < raterCount; second++) {
      const b = columns[second];

      let observed = 0;
      for (let i = 0; i < itemCount; i++) observed += disagreement(a[i], b[i]);
      observed /= itemCount;

      let expected = 0;
      for (let i = 0; i < itemCount; i++) {
        for (let j = 0; j < itemCount; j++) expected += disagreement(a[i], b[j]);
      }
      expected /= itemCount * itemCount;

      if (expected !== 0) {
        kappaTotal += 1 - observed / expected;
      } else if (observed === 0) {
        // No spread at all and no observed disagreement: perfect agreement.
        kappaTotal += 1;
      }
      pairCount++;
    }
  }
  return pairCount === 0 ? NaN : kappaTotal / pairCount;
}
5179
/**
 * Intraclass correlation derived from a two-way sum-of-squares decomposition
 * (rows = items, columns = raters):
 *
 *   (MSR - MSE) / (MSR + (k - 1) * MSE + k * (MSC - MSE) / n)
 *
 * @param {number[][]} rows - n items by k raters.
 * @returns {number} The coefficient; NaN when n < 2 or k < 2. When the
 *   denominator is zero, 1 if both MSR and MSE are zero, otherwise 0.
 */
function icc21(rows) {
  const n = rows.length;
  if (n < 2) return NaN;
  const k = rows[0].length;
  if (k < 2) return NaN;

  const rowMeans = rows.map((row) => row.reduce((acc, v) => acc + v, 0) / k);
  const colMeans = Array.from({ length: k }, (_, j) => {
    let columnSum = 0;
    for (let i = 0; i < n; i++) columnSum += rows[i][j];
    return columnSum / n;
  });

  let grandMean = 0;
  for (const m of rowMeans) grandMean += m;
  grandMean /= n;

  let ssRows = 0;
  for (const m of rowMeans) ssRows += (m - grandMean) ** 2;
  ssRows *= k;

  let ssCols = 0;
  for (const m of colMeans) ssCols += (m - grandMean) ** 2;
  ssCols *= n;

  let ssTotal = 0;
  for (const row of rows) {
    for (let j = 0; j < k; j++) ssTotal += (row[j] - grandMean) ** 2;
  }

  const ssError = ssTotal - ssRows - ssCols;
  const dfError = (n - 1) * (k - 1);

  const msRows = ssRows / (n - 1);
  const msCols = ssCols / (k - 1);
  const msError = dfError > 0 ? ssError / dfError : 0;

  const denominator = msRows + (k - 1) * msError + k * (msCols - msError) / n;
  if (denominator !== 0) {
    return (msRows - msError) / denominator;
  }
  // Degenerate denominator: only "no variance anywhere" counts as agreement.
  return msRows === 0 && msError === 0 ? 1 : 0;
}
5217
/**
 * Average a two-column statistic over every unordered pair of rater columns.
 *
 * @param {number[][]} rows - One row per item, one column per rater.
 * @param {(a: number[], b: number[]) => number} fn - Statistic for two columns.
 * @returns {number} Mean of the finite pairwise results; NaN when there are
 *   fewer than two columns or no pair produced a finite value.
 */
function avgPairwise(rows, fn) {
  const columnCount = rows[0]?.length ?? 0;
  if (columnCount < 2) return NaN;

  const column = (index) => rows.map((row) => row[index]);

  let total = 0;
  let finitePairs = 0;
  for (let left = 0; left < columnCount; left++) {
    for (let right = left + 1; right < columnCount; right++) {
      const stat = fn(column(left), column(right));
      if (!Number.isFinite(stat)) continue;
      total += stat;
      finitePairs++;
    }
  }
  return finitePairs === 0 ? NaN : total / finitePairs;
}
5235
/**
 * Spearman rank correlation: `pearsonR` applied to the tie-averaged ranks of
 * both inputs.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} NaN when the lengths differ or fewer than two values are given.
 */
function spearmanR(a, b) {
  const comparable = a.length === b.length && a.length >= 2;
  return comparable ? pearsonR(rankWithTies(a), rankWithTies(b)) : NaN;
}
5239
/**
 * Assign 1-based ranks to the values, giving tied values the average of the
 * ranks they span.
 *
 * @param {number[]} xs
 * @returns {number[]} Ranks aligned with the positions of `xs`.
 */
function rankWithTies(xs) {
  const count = xs.length;
  // Original positions ordered by ascending value.
  const order = xs.map((_, position) => position).sort((p, q) => xs[p] - xs[q]);
  const ranks = new Array(count).fill(0);

  for (let start = 0; start < count; ) {
    // Extend `end` across the run of values equal to the one at `start`.
    let end = start;
    while (end + 1 < count && xs[order[end + 1]] === xs[order[start]]) end++;

    const sharedRank = (start + end) / 2 + 1;
    for (let k = start; k <= end; k++) ranks[order[k]] = sharedRank;

    start = end + 1;
  }
  return ranks;
}
5254
/**
 * Create a deterministic pseudo-random generator from a seed.
 *
 * The seed is coerced to an unsigned 32-bit integer. Every call of the
 * returned function advances the internal state and yields a number in
 * [0, 1).
 *
 * @param {number} seed
 * @returns {() => number}
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return () => {
    state = (state + 0x6d2b79f5) >>> 0;
    let mixed = Math.imul(state ^ (state >>> 15), state | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    // Force unsigned, then scale by 2^32 into [0, 1).
    return ((mixed ^ (mixed >>> 14)) >>> 0) / 0x100000000;
  };
}
5264
/**
 * Lower and upper quantile positions of a two-sided interval at the given
 * confidence level.
 *
 * @param {number} ciLevel - For example 0.95.
 * @returns {[number, number]} `[tail, 1 - tail]` with `tail = (1 - ciLevel) / 2`.
 */
function percentileBounds(ciLevel) {
  const lower = (1 - ciLevel) / 2;
  const upper = 1 - lower;
  return [lower, upper];
}
5268
/**
 * Quantile of an ascending-sorted array, linearly interpolating between the
 * two neighbouring elements.
 *
 * @param {number[]} sorted - Values already sorted in ascending order.
 * @param {number} q - Quantile position; 0 maps to the first element, 1 to the last.
 * @returns {number} NaN for an empty array.
 */
function quantile(sorted, q) {
  const size = sorted.length;
  if (size === 0) return NaN;
  if (size === 1) return sorted[0];

  const position = q * (size - 1);
  const below = Math.floor(position);
  const above = Math.ceil(position);
  if (below === above) {
    return sorted[below];
  }
  const weight = position - below;
  return sorted[below] * (1 - weight) + sorted[above] * weight;
}
4377
5278
 
4378
5279
  // src/observability.ts
4379
5280
  async function toLangfuseEnvelope(store, runId) {
@@ -4875,7 +5776,7 @@ async function commitBisect(options) {
4875
5776
  }
4876
5777
  async function promptBisect(options) {
4877
5778
  const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
4878
- const join3 = (paragraphs) => paragraphs.join("\n\n");
5779
+ const join4 = (paragraphs) => paragraphs.join("\n\n");
4879
5780
  const goodParas = split(options.good);
4880
5781
  const badParas = split(options.bad);
4881
5782
  if (goodParas.length !== badParas.length) {
@@ -4895,7 +5796,7 @@ async function promptBisect(options) {
4895
5796
  const result = await bisect({
4896
5797
  good: goodMask,
4897
5798
  bad: badMask,
4898
- runEval: (mask) => options.runEval(join3(paragraphsFor(mask))),
5799
+ runEval: (mask) => options.runEval(join4(paragraphsFor(mask))),
4899
5800
  maxIterations: options.maxIterations ?? n + 5,
4900
5801
  halfway: (g, b) => {
4901
5802
  for (let i = 0; i < g.length; i++) {
@@ -4926,12 +5827,12 @@ async function promptBisect(options) {
4926
5827
  }
4927
5828
  }
4928
5829
  const materializedPath = result.path.map((s) => ({
4929
- state: join3(paragraphsFor(s.state)),
5830
+ state: join4(paragraphsFor(s.state)),
4930
5831
  score: s.score,
4931
5832
  pass: s.pass
4932
5833
  }));
4933
5834
  return {
4934
- culprit: join3(paragraphsFor(culprit)),
5835
+ culprit: join4(paragraphsFor(culprit)),
4935
5836
  path: materializedPath,
4936
5837
  converged: result.converged,
4937
5838
  inputInconsistent: result.inputInconsistent,
@@ -5176,7 +6077,7 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
5176
6077
  runCountByScenario.set(r.scenarioId, (runCountByScenario.get(r.scenarioId) ?? 0) + 1);
5177
6078
  }
5178
6079
  const runCounts = [...runCountByScenario.values()];
5179
- const p25 = runCounts.length > 0 ? quantile(runCounts, 0.25) : 0;
6080
+ const p25 = runCounts.length > 0 ? quantile2(runCounts, 0.25) : 0;
5180
6081
  for (const s of scenarios) {
5181
6082
  const count = runCountByScenario.get(s.id) ?? 0;
5182
6083
  if (count <= p25 && count < 3) {
@@ -5230,7 +6131,7 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
5230
6131
  }
5231
6132
  return targets.sort((a, b) => b.priority - a.priority).slice(0, topK);
5232
6133
  }
5233
- function quantile(xs, p) {
6134
+ function quantile2(xs, p) {
5234
6135
  const sorted = [...xs].sort((a, b) => a - b);
5235
6136
  const idx = p * (sorted.length - 1);
5236
6137
  const lo = Math.floor(idx);
@@ -7619,6 +8520,52 @@ function createCompositeMutator(opts) {
7619
8520
  };
7620
8521
  }
7621
8522
 
8523
+ // src/discover-personas.ts
8524
+ import { promises as fs } from "fs";
8525
+ import { basename, extname, join as join3 } from "path";
8526
// Default persona filename shape: two digits, a dash, a name, and one of the
// supported extensions (e.g. "01-support-agent.yaml").
var DEFAULT_PATTERN = /^\d{2}-.+\.(yaml|yml|json|md)$/;

/**
 * Discover persona files in a directory.
 *
 * @param {string} dir - Directory to scan. A missing directory yields `[]`.
 * @param {object} [opts]
 * @param {RegExp} [opts.pattern] - Filename pattern; defaults to `DEFAULT_PATTERN`.
 * @param {string[]} [opts.exclude] - Filenames or ids (filename without
 *   extension) to leave out.
 * @param {string[]} [opts.include] - When non-empty, keep only files whose
 *   filename or id contains at least one of these substrings.
 * @param {boolean} [opts.recursive] - Descend into sub-directories when truthy.
 * @returns {Promise<Array<{path: string, filename: string, id: string}>>}
 *   Matches sorted by filename.
 * @throws Re-throws any `readdir` error other than `ENOENT`.
 */
async function discoverPersonas(dir, opts = {}) {
  const pattern = opts.pattern ?? DEFAULT_PATTERN;
  const exclude = new Set(opts.exclude ?? []);
  const include = opts.include;

  // A global or sticky RegExp keeps `lastIndex` between `.test()` calls, so a
  // match on one filename would make the next matching filename fail. Reset
  // it so every filename is tested from the start.
  const matchesPattern = (filename) => {
    if (pattern.global || pattern.sticky) pattern.lastIndex = 0;
    return pattern.test(filename);
  };

  async function walk(d) {
    let entries;
    try {
      const raw = await fs.readdir(d, { withFileTypes: true });
      entries = raw.map((e) => ({ name: e.name, isDir: e.isDirectory() }));
    } catch (err) {
      // A directory that does not exist simply contributes no personas.
      if (err?.code === "ENOENT") return [];
      throw err;
    }
    const out = [];
    for (const entry of entries) {
      const full = join3(d, entry.name);
      if (entry.isDir) {
        if (opts.recursive) out.push(...await walk(full));
        continue;
      }
      if (!matchesPattern(entry.name)) continue;
      const id = basename(entry.name, extname(entry.name));
      if (exclude.has(entry.name) || exclude.has(id)) continue;
      if (include && include.length > 0) {
        const matched = include.some((needle) => entry.name.includes(needle) || id.includes(needle));
        if (!matched) continue;
      }
      out.push({ path: full, filename: entry.name, id });
    }
    return out;
  }

  const results = await walk(dir);
  results.sort((a, b) => a.filename.localeCompare(b.filename));
  return results;
}
8568
+
7622
8569
  // src/evolution-telemetry.ts
7623
8570
  import { appendFileSync as appendFileSync3, existsSync as existsSync5, mkdirSync as mkdirSync3, readFileSync as readFileSync4, writeFileSync } from "fs";
7624
8571
  import { dirname as dirname3 } from "path";
@@ -8008,6 +8955,90 @@ var JsonlTrialCache = class {
8008
8955
  }
8009
8956
  };
8010
8957
 
8958
+ // src/judge-retry.ts
8959
// Retry-policy defaults used when the caller's policy omits a field.
var DEFAULT_MAX_ATTEMPTS = 3;
var DEFAULT_TIMEOUT_MS = 90000;

// Exponential backoff starting at 500 ms and capped at 16 s.
var DEFAULT_BACKOFF = (attempt) => {
  const delayMs = 500 * 2 ** attempt;
  return Math.min(delayMs, 16000);
};

// Error name/message fragments treated as retryable by default.
var ABORT_PATTERNS = [
  /AbortError/i,
  /TimeoutError/i,
  /fetch failed/i,
  /ECONNRESET/i,
  /ETIMEDOUT/i,
  /EAI_AGAIN/i,
  /this operation was aborted/i,
  /stream.*ended.*unexpectedly/i,
  /socket hang up/i
];

// Numeric `status` values treated as retryable by default.
var RETRYABLE_HTTP_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);
8974
/**
 * Default retry predicate.
 *
 * An error is retryable when it is an `Error` whose message or name matches
 * one of `ABORT_PATTERNS`, or whose numeric `status` is in
 * `RETRYABLE_HTTP_STATUS`. Anything that is not an `Error` is not retryable.
 *
 * @param {unknown} err
 * @returns {boolean}
 */
function defaultIsRetryable(err) {
  if (!(err instanceof Error)) return false;

  const matchesKnownPattern = ABORT_PATTERNS.some(
    (pattern) => pattern.test(err.message) || pattern.test(err.name)
  );
  if (matchesKnownPattern) return true;

  const httpStatus = err.status;
  return typeof httpStatus === "number" && RETRYABLE_HTTP_STATUS.has(httpStatus);
}
8982
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
8985
/**
 * Call a judge function with a per-attempt timeout, retries with backoff, and
 * optional fallback across several models.
 *
 * Models are tried in order, each up to `maxAttempts` times. The first
 * successful call wins. A non-retryable error stops everything immediately.
 * Backoff is applied between attempts on the same model, but not before
 * moving on to the next model.
 *
 * The timeout is enforced even when `judgeFn` ignores the `AbortSignal`: the
 * attempt is rejected with the abort reason as soon as the timer fires, so a
 * hung judge call cannot stall the loop. The signal is still aborted so
 * cooperative implementations can cancel their underlying request.
 *
 * @param {(model: string|undefined, signal: AbortSignal) => *} judgeFn
 * @param {object} [policy]
 * @param {number} [policy.maxAttempts] - Attempts per model (default `DEFAULT_MAX_ATTEMPTS`).
 * @param {number} [policy.timeoutMs] - Per-attempt timeout (default `DEFAULT_TIMEOUT_MS`).
 * @param {(attempt: number) => number} [policy.backoffMs] - Delay before the
 *   next attempt, given the zero-based attempt index (default `DEFAULT_BACKOFF`).
 * @param {(err: Error) => boolean} [policy.isRetryable] - Defaults to `defaultIsRetryable`.
 * @param {string[]} [policy.models] - Models to try in order; when empty or
 *   missing, `judgeFn` receives `undefined` as the model.
 * @returns {Promise<object>} On success `{ value, succeeded: true, attempts,
 *   modelUsed, attemptErrors }`; on failure `{ value: null, succeeded: false,
 *   attempts, error, attemptErrors }`. Never rejects because of `judgeFn`.
 */
async function withJudgeRetry(judgeFn, policy = {}) {
  const maxAttempts = policy.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
  const timeoutMs = policy.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const backoff = policy.backoffMs ?? DEFAULT_BACKOFF;
  const isRetryable = policy.isRetryable ?? defaultIsRetryable;
  const models = policy.models && policy.models.length > 0 ? policy.models : [void 0];

  // Run one attempt, racing the judge call against the timeout-driven abort.
  const runAttempt = async (model) => {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(new Error("TimeoutError")), timeoutMs);
    const aborted = new Promise((_, reject) => {
      controller.signal.addEventListener("abort", () => reject(controller.signal.reason), {
        once: true
      });
    });
    try {
      return await Promise.race([judgeFn(model, controller.signal), aborted]);
    } finally {
      // Always release the timer so it cannot fire during backoff or keep the
      // process alive after the attempt has settled.
      clearTimeout(timer);
    }
  };

  let totalAttempts = 0;
  const attemptErrors = [];
  let lastError;
  for (const model of models) {
    for (let attempt = 0; attempt < maxAttempts; attempt++) {
      totalAttempts += 1;
      try {
        const value = await runAttempt(model);
        return {
          value,
          succeeded: true,
          attempts: totalAttempts,
          modelUsed: model,
          attemptErrors
        };
      } catch (err) {
        const errObj = err instanceof Error ? err : new Error(String(err));
        lastError = errObj;
        attemptErrors.push({
          attempt: totalAttempts,
          model: model ?? "(default)",
          error: errObj.message
        });
        if (!isRetryable(errObj)) {
          return {
            value: null,
            succeeded: false,
            attempts: totalAttempts,
            error: errObj,
            attemptErrors
          };
        }
        if (attempt < maxAttempts - 1) {
          await sleep(backoff(attempt));
        }
      }
    }
  }
  return {
    value: null,
    succeeded: false,
    attempts: totalAttempts,
    error: lastError,
    attemptErrors
  };
}
9041
+
8011
9042
  // src/orthogonality.ts
8012
9043
  function passOrthogonality(input) {
8013
9044
  const passes = input.passes;
@@ -8225,6 +9256,55 @@ function createSandboxPool(opts) {
8225
9256
  utilization
8226
9257
  };
8227
9258
  }
9259
+
9260
+ // src/trial-aggregator.ts
9261
/**
 * Arithmetic mean of the values; 0 for an empty array.
 *
 * @param {number[]} xs
 * @returns {number}
 */
function meanOf(xs) {
  const count = xs.length;
  return count === 0 ? 0 : xs.reduce((total, x) => total + x, 0) / count;
}
9265
/**
 * Average every numeric metric across a list of metric records.
 *
 * For each key seen in any record, the mean is taken over the records whose
 * value for that key has type "number". Keys with no numeric values are
 * omitted.
 *
 * @param {Array<Record<string, unknown>>} rows
 * @returns {Record<string, number>}
 */
function meanMetrics(rows) {
  const averaged = {};
  if (rows.length === 0) return averaged;

  const metricNames = new Set(rows.flatMap((row) => Object.keys(row)));
  for (const name of metricNames) {
    const numericValues = [];
    for (const row of rows) {
      const value = row[name];
      if (typeof value === "number") numericValues.push(value);
    }
    if (numericValues.length > 0) averaged[name] = meanOf(numericValues);
  }
  return averaged;
}
9276
/**
 * Aggregate trial results according to how judge failures should be treated.
 *
 * Trials with a truthy `error` are ignored for every statistic except
 * `totalTrials`. Of the remaining trials, those with `judgeSucceeded === false`
 * are judge failures.
 *
 * - mode "strict-fail": if any judge failure exists, return zeroed statistics
 *   plus a `strictFailure` summary.
 * - mode "exclude-failed": means are taken over trials without a judge failure.
 * - any other mode: means are taken over all trials without an `error`.
 *
 * `okRate` is always computed over all trials without an `error`.
 *
 * @param {Array<object>} trials
 * @param {{mode: string}} opts
 * @returns {object} Aggregate with `meanScore`, `meanCost`, `meanDurationMs`,
 *   `okRate`, `countedTrials`, `excludedFailedTrials`, `totalTrials`, `metrics`
 *   and, on strict failure, `strictFailure`.
 */
function aggregateTrialsByMode(trials, opts) {
  const graded = trials.filter((trial) => !trial.error);
  const judgeOk = [];
  const judgeFailed = [];
  for (const trial of graded) {
    (trial.judgeSucceeded === false ? judgeFailed : judgeOk).push(trial);
  }

  const { mode } = opts;
  if (mode === "strict-fail" && judgeFailed.length > 0) {
    const firstWithError = judgeFailed.find((trial) => trial.judgeError);
    return {
      meanScore: 0,
      meanCost: 0,
      meanDurationMs: 0,
      okRate: 0,
      countedTrials: 0,
      excludedFailedTrials: judgeFailed.length,
      totalTrials: trials.length,
      metrics: {},
      strictFailure: {
        failedCount: judgeFailed.length,
        firstError: firstWithError?.judgeError
      }
    };
  }

  const counted = mode === "exclude-failed" ? judgeOk : graded;
  const okCount = graded.filter((trial) => trial.ok).length;
  return {
    meanScore: meanOf(counted.map((trial) => trial.score)),
    meanCost: meanOf(counted.map((trial) => trial.cost ?? 0)),
    meanDurationMs: meanOf(counted.map((trial) => trial.durationMs ?? 0)),
    okRate: graded.length === 0 ? 0 : okCount / graded.length,
    countedTrials: counted.length,
    excludedFailedTrials: judgeFailed.length,
    totalTrials: trials.length,
    metrics: meanMetrics(counted.map((trial) => trial.metrics ?? {}))
  };
}
8228
9308
  export {
8229
9309
  AgentDriver,
8230
9310
  AgentEvalError,
@@ -8314,6 +9394,7 @@ export {
8314
9394
  adversarialJudge,
8315
9395
  aggregateLlm,
8316
9396
  aggregateRunScore,
9397
+ aggregateTrialsByMode,
8317
9398
  allCriticalPassed,
8318
9399
  analyzeAntiSlop,
8319
9400
  analyzeSeries,
@@ -8336,6 +9417,7 @@ export {
8336
9417
  buildTrajectory,
8337
9418
  byteLengthRange,
8338
9419
  calibrateJudge,
9420
+ calibrateJudgeContinuous,
8339
9421
  callLlm,
8340
9422
  callLlmJson,
8341
9423
  canaryLeakView,
@@ -8360,6 +9442,7 @@ export {
8360
9442
  computeToolUseMetrics,
8361
9443
  confidenceInterval,
8362
9444
  containsAll,
9445
+ continuousAgreement,
8363
9446
  controlFailureClassFromVerification,
8364
9447
  controlRunToFeedbackTrajectory,
8365
9448
  controlRunToRunRecord,
@@ -8384,6 +9467,7 @@ export {
8384
9467
  defaultProviderRedactor,
8385
9468
  defaultReferenceReplayMatcher,
8386
9469
  deployGateLayer,
9470
+ discoverPersonas,
8387
9471
  distillPlaybook,
8388
9472
  dominates,
8389
9473
  estimateCost,
@@ -8417,6 +9501,7 @@ export {
8417
9501
  formatDriverReport,
8418
9502
  formatFindings,
8419
9503
  gainHistogram,
9504
+ ghCliClient,
8420
9505
  precision as goldenPrecision,
8421
9506
  gradeSemanticStatus,
8422
9507
  groupBy,
@@ -8424,6 +9509,7 @@ export {
8424
9509
  hashJson,
8425
9510
  hashScenarios,
8426
9511
  htmlContainsElement,
9512
+ httpGithubClient,
8427
9513
  inMemoryReferenceReplayStore,
8428
9514
  inMemoryReviewStore,
8429
9515
  integrationAsi,
@@ -8484,6 +9570,7 @@ export {
8484
9570
  printDriverSummary,
8485
9571
  probeLlm,
8486
9572
  promptBisect,
9573
+ proposeAutomatedPullRequest,
8487
9574
  proposeSynthesisTargets,
8488
9575
  providerFromBaseUrl,
8489
9576
  pytestTestParser,
@@ -8528,6 +9615,7 @@ export {
8528
9615
  runKeywordCoverageJudgeUrl,
8529
9616
  runLiveProof,
8530
9617
  runMultiShotOptimization,
9618
+ runProductionLoop,
8531
9619
  runPromptEvolution,
8532
9620
  runProposeReview,
8533
9621
  runProposeReviewAsControlLoop,
@@ -8582,6 +9670,7 @@ export {
8582
9670
  whitespaceCollapseMutator,
8583
9671
  wilcoxonSignedRank,
8584
9672
  withAssignedFeedbackSplit,
9673
+ withJudgeRetry,
8585
9674
  wranglerDeployRunner
8586
9675
  };
8587
9676
  //# sourceMappingURL=index.js.map