@tangle-network/agent-eval 0.24.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -8,9 +8,10 @@ import {
8
8
  classifyFailure,
9
9
  compareToBaseline,
10
10
  computeToolUseMetrics,
11
+ failureClusterView,
11
12
  iqr,
12
13
  welchsTTest
13
- } from "./chunk-OHEPNJQN.js";
14
+ } from "./chunk-JLZQWFV3.js";
14
15
  import {
15
16
  exportTrainingData,
16
17
  toNdjson
@@ -95,7 +96,7 @@ import {
95
96
  summarizePreferenceMemory,
96
97
  trialTraceFromMultiShotTrial,
97
98
  withAssignedFeedbackSplit
98
- } from "./chunk-VRJVTXRV.js";
99
+ } from "./chunk-EDUKQ5AM.js";
99
100
  import {
100
101
  RunRecordValidationError,
101
102
  isRunRecord,
@@ -220,6 +221,304 @@ import {
220
221
  } from "./chunk-NG236HPC.js";
221
222
  import "./chunk-PZ5AY32C.js";
222
223
 
224
+ // src/auto-pr.ts
225
/**
 * Checks a pull-request proposal and, if it is well-formed, forwards it to
 * the given client.
 *
 * @param {{ proposeChange: Function }} client - Backend that performs the change.
 * @param {object} input - Proposal; it is passed to `validate` first.
 * @returns {Promise<*>} Whatever `client.proposeChange(input)` resolves to.
 * @throws Rejects with whatever `validate` throws for a malformed proposal.
 */
async function proposeAutomatedPullRequest(client, input) {
  validate(input);
  const outcome = await client.proposeChange(input);
  return outcome;
}
229
/**
 * Rejects malformed pull-request proposals before any client is invoked.
 *
 * Missing or non-string fields are reported as `ValidationError` (the same
 * error type used for every other rule here) instead of surfacing as an
 * opaque `TypeError` from a property access on `undefined`.
 *
 * @param {object} input - Proposal with `repo`, `branchName`, optional
 *   `baseBranch` (treated as "main" when absent), `fileChanges` and `title`.
 * @throws {ValidationError} When any rule below is violated.
 */
function validate(input) {
  const isNonBlank = (value) => typeof value === "string" && value.trim() !== "";

  if (!isNonBlank(input?.repo?.owner) || !isNonBlank(input?.repo?.name)) {
    throw new ValidationError("proposeAutomatedPullRequest: repo.owner and repo.name required");
  }
  if (!isNonBlank(input.branchName) || /\s/.test(input.branchName)) {
    throw new ValidationError(
      "proposeAutomatedPullRequest: branchName must be non-empty and contain no whitespace"
    );
  }
  if (input.branchName === (input.baseBranch ?? "main")) {
    throw new ValidationError("proposeAutomatedPullRequest: branchName must differ from baseBranch");
  }
  if (!Array.isArray(input.fileChanges) || input.fileChanges.length === 0) {
    throw new ValidationError("proposeAutomatedPullRequest: fileChanges must not be empty");
  }

  const seenPaths = new Set();
  for (const change of input.fileChanges) {
    const path = change?.path;
    // Any ".." substring or a leading "/" is refused so a change cannot
    // address anything outside the repository root.
    if (!isNonBlank(path) || path.includes("..") || path.startsWith("/")) {
      throw new ValidationError(
        `proposeAutomatedPullRequest: invalid file path "${path}" (no '..' or leading '/')`
      );
    }
    if (seenPaths.has(path)) {
      throw new ValidationError(`proposeAutomatedPullRequest: duplicate file path "${path}"`);
    }
    seenPaths.add(path);
  }

  if (!isNonBlank(input.title)) {
    throw new ValidationError("proposeAutomatedPullRequest: title must not be empty");
  }
}
260
/**
 * Creates a pull-request client that talks to the GitHub REST API via fetch.
 *
 * `proposeChange` creates one blob per file, a tree on top of the base
 * commit's tree, a commit, then creates or force-updates the head branch and
 * finally opens a pull request unless an open one already exists for that
 * head. Reviewer and label requests are best-effort.
 *
 * @param {object} opts
 * @param {string} opts.token - Bearer token sent in the `authorization` header.
 * @param {Function} [opts.fetchImpl] - fetch-compatible function; defaults to global `fetch`.
 * @param {string} [opts.apiBase] - API root; trailing slashes are stripped.
 * @param {() => Date} [opts.now] - Clock used for the commit author date.
 * @returns {{ proposeChange(input: object): Promise<{prUrl: string, branchName: string, headSha: string, dryRun: boolean}> }}
 * @throws {ConfigError} (from `proposeChange`) on any non-OK GitHub response,
 *   or when the base branch / base commit does not exist.
 */
function httpGithubClient(opts) {
  const fetchImpl = opts.fetchImpl ?? fetch;
  const apiBase = (opts.apiBase ?? "https://api.github.com").replace(/\/+$/, "");
  const now = opts.now ?? (() => /* @__PURE__ */ new Date());

  // Branch names may contain "/" (a real path separator in ref URLs) as well
  // as characters such as "#", "?" or "%" that would otherwise corrupt the
  // request URL, so each segment is encoded on its own.
  const encodeRefPath = (ref) => ref.split("/").map(encodeURIComponent).join("/");

  /**
   * Performs one API call and returns the parsed JSON body.
   * Returns null only when `accept404` is set and the response status is 404.
   */
  async function api(method, path, body, accept404 = false) {
    const res = await fetchImpl(`${apiBase}${path}`, {
      method,
      headers: {
        accept: "application/vnd.github+json",
        "content-type": "application/json",
        authorization: `Bearer ${opts.token}`,
        "x-github-api-version": "2022-11-28"
      },
      body: body === void 0 ? void 0 : JSON.stringify(body)
    });
    if (accept404 && res.status === 404) return null;
    if (!res.ok) {
      const text = await res.text().catch(() => "");
      throw new ConfigError(
        `proposeAutomatedPullRequest: GitHub ${method} ${path} \u2192 ${res.status} ${text.slice(0, 400)}`
      );
    }
    return await res.json();
  }

  return {
    async proposeChange(input) {
      const baseBranch = input.baseBranch ?? "main";
      const { owner, name } = input.repo;
      if (input.dryRun) {
        return {
          prUrl: `https://github.com/${owner}/${name}/compare/${baseBranch}...${input.branchName}`,
          branchName: input.branchName,
          headSha: "dry-run",
          dryRun: true
        };
      }

      const repoPath = `/repos/${encodeURIComponent(owner)}/${encodeURIComponent(name)}`;
      const headRefPath = `heads/${encodeRefPath(input.branchName)}`;

      // accept404 lets a missing base branch/commit reach the dedicated error
      // messages below instead of the generic "GitHub GET ... 404" failure.
      const baseRef = await api(
        "GET",
        `${repoPath}/git/ref/heads/${encodeRefPath(baseBranch)}`,
        void 0,
        true
      );
      if (!baseRef) {
        throw new ConfigError(`proposeAutomatedPullRequest: base branch "${baseBranch}" not found`);
      }
      const baseSha = baseRef.object.sha;
      const baseCommit = await api(
        "GET",
        `${repoPath}/git/commits/${encodeURIComponent(baseSha)}`,
        void 0,
        true
      );
      if (!baseCommit) {
        throw new ConfigError(
          `proposeAutomatedPullRequest: base commit ${baseSha} not found (race condition?)`
        );
      }

      // Blobs are created one at a time, in the order the changes were given.
      const treeEntries = [];
      for (const change of input.fileChanges) {
        const blob = await api("POST", `${repoPath}/git/blobs`, {
          content: change.contents,
          encoding: "utf-8"
        });
        if (!blob) throw new ConfigError("proposeAutomatedPullRequest: blob creation returned null");
        treeEntries.push({
          path: change.path,
          mode: "100644",
          type: "blob",
          sha: blob.sha
        });
      }

      const tree = await api("POST", `${repoPath}/git/trees`, {
        base_tree: baseCommit.tree.sha,
        tree: treeEntries
      });
      if (!tree) throw new ConfigError("proposeAutomatedPullRequest: tree creation returned null");

      // An explicit identity is sent only when both name and email are given.
      const author = input.authorName && input.authorEmail
        ? { name: input.authorName, email: input.authorEmail, date: now().toISOString() }
        : void 0;
      const commitMessage = renderCommitMessage(input);
      const commit = await api("POST", `${repoPath}/git/commits`, {
        message: commitMessage,
        tree: tree.sha,
        parents: [baseSha],
        ...(author ? { author, committer: author } : {})
      });
      if (!commit) {
        throw new ConfigError("proposeAutomatedPullRequest: commit creation returned null");
      }

      // Create the head branch, or force-move it when it already exists.
      const existing = await api("GET", `${repoPath}/git/ref/${headRefPath}`, void 0, true);
      if (!existing) {
        await api("POST", `${repoPath}/git/refs`, {
          ref: `refs/heads/${input.branchName}`,
          sha: commit.sha
        });
      } else if (existing.object.sha !== commit.sha) {
        await api("PATCH", `${repoPath}/git/refs/${headRefPath}`, {
          sha: commit.sha,
          force: true
        });
      }

      // Reuse an already-open PR for this head instead of opening a second one.
      const openPrs = await api(
        "GET",
        `${repoPath}/pulls?state=open&head=${encodeURIComponent(`${owner}:${input.branchName}`)}`
      );
      let pr;
      if (openPrs && openPrs.length > 0) {
        pr = openPrs[0];
      } else {
        const created = await api("POST", `${repoPath}/pulls`, {
          title: input.title,
          body: input.body,
          head: input.branchName,
          base: baseBranch
        });
        if (!created) {
          throw new ConfigError("proposeAutomatedPullRequest: PR creation returned null");
        }
        pr = created;
      }

      // Reviewers and labels are deliberately best-effort: a failure here
      // must not fail a pull request that has already been opened.
      if (input.reviewers && input.reviewers.length > 0) {
        await api(
          "POST",
          `${repoPath}/pulls/${pr.number}/requested_reviewers`,
          { reviewers: input.reviewers },
          true
        ).catch(() => {
        });
      }
      if (input.labels && input.labels.length > 0) {
        await api(
          "POST",
          `${repoPath}/issues/${pr.number}/labels`,
          { labels: input.labels },
          true
        ).catch(() => {
        });
      }

      return {
        prUrl: pr.html_url,
        branchName: input.branchName,
        headSha: commit.sha,
        dryRun: false
      };
    }
  };
}
398
/**
 * Creates a pull-request client that drives a local checkout with `git` and
 * the GitHub CLI.
 *
 * `proposeChange` hard-resets the checkout in `cwd` to `origin/<base>`,
 * recreates the head branch, writes and stages every file change, commits,
 * force-pushes, and opens a pull request unless an open one already exists
 * for that head branch.
 *
 * @param {object} [opts]
 * @param {string} [opts.bin] - GitHub CLI executable; defaults to "gh".
 * @param {string} [opts.cwd] - Working tree to operate in; defaults to `process.cwd()`.
 * @param {Function} [opts.exec] - `(cmd, args, { cwd, stdin })` resolving to
 *   `{ stdout, stderr, exitCode }`; defaults to `defaultExec`.
 * @returns {{ proposeChange(input: object): Promise<{prUrl: string, branchName: string, headSha: string, dryRun: boolean}> }}
 * @throws {ConfigError} (from `proposeChange`) when a required command exits non-zero.
 */
function ghCliClient(opts = {}) {
  const bin = opts.bin ?? "gh";
  const cwd = opts.cwd ?? process.cwd();
  const exec = opts.exec ?? defaultExec;

  /** Runs a command and turns a non-zero exit code into a ConfigError. */
  async function run(cmd, args, stdin) {
    const r = await exec(cmd, args, { cwd, stdin });
    if (r.exitCode !== 0) {
      throw new ConfigError(
        `proposeAutomatedPullRequest: ${cmd} ${args.join(" ")} failed (${r.exitCode}): ${r.stderr.trim() || r.stdout.trim()}`
      );
    }
    return r;
  }

  return {
    async proposeChange(input) {
      const baseBranch = input.baseBranch ?? "main";
      if (input.dryRun) {
        return {
          prUrl: `https://github.com/${input.repo.owner}/${input.repo.name}/compare/${baseBranch}...${input.branchName}`,
          branchName: input.branchName,
          headSha: "dry-run",
          dryRun: true
        };
      }

      await run("git", ["fetch", "origin", baseBranch]);
      await run("git", ["checkout", baseBranch]);
      await run("git", ["reset", "--hard", `origin/${baseBranch}`]);
      // Deleting the branch is allowed to fail (it usually does not exist
      // yet), so the raw exec is used and its exit code is ignored.
      await exec("git", ["branch", "-D", input.branchName], { cwd });
      await run("git", ["checkout", "-b", input.branchName]);

      const { mkdir, writeFile } = await import("fs/promises");
      const { dirname, join, resolve } = await import("path");
      for (const change of input.fileChanges) {
        const abs = resolve(cwd, change.path);
        await mkdir(dirname(abs), { recursive: true });
        await writeFile(abs, change.contents, "utf8");
        await run("git", ["add", join(change.path)]);
      }

      // The requested identity used to be collected into an env object that
      // was never handed to any command, so it was silently dropped. Passing
      // it as per-invocation config applies it to this commit only; git uses
      // user.name / user.email for both author and committer.
      const identityArgs = [];
      if (input.authorName) identityArgs.push("-c", `user.name=${input.authorName}`);
      if (input.authorEmail) identityArgs.push("-c", `user.email=${input.authorEmail}`);
      const message = renderCommitMessage(input);
      await run("git", [...identityArgs, "commit", "-m", message]);

      const headRes = await run("git", ["rev-parse", "HEAD"]);
      const headSha = headRes.stdout.trim();
      await run("git", ["push", "-f", "origin", input.branchName]);

      // Look for an already-open PR; a failing lookup just means "none found".
      const existing = await exec(
        bin,
        [
          "pr",
          "list",
          "--state",
          "open",
          "--head",
          input.branchName,
          "--json",
          "url,number",
          "--limit",
          "1"
        ],
        { cwd }
      );
      let prUrl = "";
      if (existing.exitCode === 0 && existing.stdout.trim()) {
        const parsed = JSON.parse(existing.stdout);
        if (parsed.length > 0 && parsed[0]) prUrl = parsed[0].url;
      }

      if (!prUrl) {
        const args = [
          "pr",
          "create",
          "--title",
          input.title,
          "--body",
          input.body,
          "--base",
          baseBranch
        ];
        if (input.reviewers && input.reviewers.length > 0) {
          args.push("--reviewer", input.reviewers.join(","));
        }
        if (input.labels && input.labels.length > 0) {
          args.push("--label", input.labels.join(","));
        }
        const r = await run(bin, args);
        // Take the first URL from the CLI output; otherwise use the raw output.
        const match = r.stdout.match(/https?:\/\/\S+/);
        prUrl = match ? match[0] : r.stdout.trim();
      }

      return { prUrl, branchName: input.branchName, headSha, dryRun: false };
    }
  };
}
491
/**
 * Spawns a process and collects its output without ever rejecting.
 *
 * @param {string} bin - Executable to run.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {{ cwd?: string, stdin?: string }} opts - Working directory and
 *   optional text written to the child's stdin.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 *   A spawn failure is reported as exit code 1 with the error message
 *   appended to stderr; a child that closes without an exit code is also
 *   reported as 1.
 */
async function defaultExec(bin, args, opts) {
  const { spawn } = await import("child_process");
  return new Promise((resolveExec) => {
    const child = spawn(bin, args, { cwd: opts.cwd });
    let stdout = "";
    let stderr = "";

    // Decode at the stream level so a multi-byte character split across two
    // chunks is not corrupted by per-chunk toString().
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });

    // Writing to the stdin of a child that failed to start (or already
    // exited) emits a stream error; without a listener it would be thrown as
    // an unhandled 'error' event. The failure is still reported through the
    // 'error' / 'close' handlers below.
    child.stdin.on("error", () => {
    });
    // Always close stdin so a child that reads it sees EOF instead of
    // waiting forever when no input was supplied.
    if (opts.stdin) {
      child.stdin.end(opts.stdin);
    } else {
      child.stdin.end();
    }

    child.on("error", (err) => {
      resolveExec({ stdout, stderr: `${stderr}${err.message}`, exitCode: 1 });
    });
    child.on("close", (code) => {
      resolveExec({ stdout, stderr, exitCode: code ?? 1 });
    });
  });
}
512
/**
 * Builds the commit message: the title, a bullet per file change that
 * carries a rationale, then the trimmed body.
 *
 * @param {{ title: string, body?: string, fileChanges: Array<{path: string, rationale?: string}> }} input
 * @returns {string} Message with surrounding whitespace trimmed.
 */
function renderCommitMessage(input) {
  const lines = [input.title, ""];
  for (const change of input.fileChanges) {
    if (change.rationale) lines.push(`- ${change.path}: ${change.rationale}`);
  }
  // Separate the rationale bullets (if any were added) from the body.
  if (lines[lines.length - 1] !== "") lines.push("");
  // The body is not validated as required, so an omitted body is treated as
  // empty instead of throwing on `undefined.trim()`.
  lines.push((input.body ?? "").trim());
  return lines.join("\n").trim();
}
521
+
223
522
  // src/executor.ts
224
523
  async function executeScenario(tc, scenario, config) {
225
524
  const startTime = Date.now();
@@ -1534,6 +1833,396 @@ function liveProofToReleaseTrace(config, trajectory, durationMs) {
1534
1833
  };
1535
1834
  }
1536
1835
 
1836
+ // src/production-loop.ts
1837
/**
 * Runs one production-loop cycle: observe stored runs and feedback, pick a
 * failure cluster, evolve the prompt against it, evaluate release
 * confidence and, when configured, open a pull request with the promoted
 * prompt.
 *
 * The result's `decision` is one of "no_actionable_failures",
 * "evolve_yielded_no_improvement", "gate_failed", "proposed_change" or
 * "pr_opened".
 *
 * @param {object} opts - Loop configuration; the fields read here are
 *   `runId`, `target`, `now`, `traceStore`, `feedbackStore`, `cluster`,
 *   `evolve`, `releaseThresholds` and `ship`.
 * @returns {Promise<object>} The cycle report assembled by `finalize`.
 * @throws {ValidationError} When search and holdout scenario ids overlap
 *   (in addition to whatever `validate2` throws).
 */
async function runProductionLoop(opts) {
  validate2(opts);
  const now = opts.now ?? (() => /* @__PURE__ */ new Date());
  const startedAt = now().toISOString();

  const observedRuns = await opts.traceStore.listRuns();
  const observedFeedback = await opts.feedbackStore.list();
  // The cluster view is requested with a floor of 1 when no minimum is
  // configured; the actionable filter below defaults to 5.
  const clusterReport = await failureClusterView(opts.traceStore, {
    minClusterSize: opts.cluster.minClusterSize ?? 1
  });

  // Every exit path reports the same observation fields; only the outcome differs.
  const finish = (decision, outcome) => finalize({
    opts,
    decision,
    startedAt,
    now,
    observedRunCount: observedRuns.length,
    observedFeedbackCount: observedFeedback.length,
    clusters: clusterReport.clusters,
    ...outcome
  });

  const minSize = opts.cluster.minClusterSize ?? 5;
  const minSeverity = opts.cluster.minSeverityRatio ?? 0.05;
  const maxClusters = opts.cluster.maxClustersPerCycle ?? 1;
  const { totalRuns } = clusterReport;
  const isActionable = (c) =>
    c.runCount >= minSize && (totalRuns === 0 || c.runCount / totalRuns >= minSeverity);
  const actionable = clusterReport.clusters.filter(isActionable).slice(0, maxClusters);
  if (actionable.length === 0) {
    return finish("no_actionable_failures", {
      actedOnCluster: null,
      evolution: null,
      release: null,
      gate: null,
      promotedPrompt: opts.evolve.baselinePrompt,
      pullRequest: null
    });
  }

  // Only the first actionable cluster is acted on in a cycle.
  const actedOn = actionable[0];
  const evolve = opts.evolve;
  const baselineKey = evolve.baselineId ?? "baseline";
  const baseline = {
    id: baselineKey,
    label: baselineKey,
    generation: 0,
    payload: evolve.baselinePrompt
  };

  const holdoutIds = uniqueIds(evolve.holdoutScenarios.map((s) => s.id));
  const searchScenarios = evolve.searchScenarios ?? deriveSearchScenarios(evolve.holdoutScenarios);
  const searchIds = uniqueIds(searchScenarios.map((s) => s.id));
  const holdoutIdSet = new Set(holdoutIds);
  if (searchIds.some((id) => holdoutIdSet.has(id))) {
    throw new ValidationError(
      "runProductionLoop: searchScenarios and holdoutScenarios must be disjoint"
    );
  }

  const reps = evolve.reps ?? 3;
  const generations = evolve.generations ?? 3;
  const populationSize = evolve.populationSize ?? Math.max(2, evolve.reps ?? 4);
  // Used when the caller supplies no run-record mapper.
  const fallbackToRunRecord = ({ variant, scenarioId, rep, split, seed, trial }) =>
    syntheticRunRecord({
      runId: `${opts.runId}-${variant.id}-${scenarioId}-${rep}-${split}`,
      variant,
      scenarioId,
      rep,
      split,
      seed,
      trial,
      target: opts.target
    });

  const evolution = await runMultiShotOptimization({
    runId: `${opts.runId}/evolve`,
    target: opts.target,
    seedVariants: [baseline],
    searchScenarioIds: searchIds,
    reps,
    generations,
    populationSize,
    scoreConcurrency: evolve.scoreConcurrency ?? 1,
    runner: evolve.runner,
    scorer: evolve.scorer,
    mutateAdapter: evolve.mutator,
    gate: {
      holdoutScenarioIds: holdoutIds,
      reps,
      gate: { ...evolve.gate, baselineKey: baseline.id },
      toRunRecord: evolve.toRunRecord ?? fallbackToRunRecord
    }
  });

  const gate = evolution.gate?.decision ?? null;
  const promotedVariant = evolution.promotedVariant;
  const promoted = promotedVariant.payload;
  const promotedChanged = promotedVariant.id !== baseline.id;

  const allTrials = evolution.evolution.generations.flatMap((g) => g.trials);
  const traceEvidence = releaseTraceEvidenceFromMultiShotTrials(allTrials);
  const toReleaseScenario = (split) => (s) => ({
    id: s.id,
    payload: s,
    split,
    tags: { persona: s.persona, label: s.label }
  });
  const releaseScenarios = [
    ...(evolve.searchScenarios ?? []).map(toReleaseScenario("train")),
    ...evolve.holdoutScenarios.map(toReleaseScenario("holdout"))
  ];
  const release = evaluateReleaseConfidence({
    target: opts.target,
    candidateId: promotedVariant.id,
    baselineId: baseline.id,
    scenarios: releaseScenarios,
    traces: traceEvidence,
    gateDecision: gate ?? void 0,
    thresholds: opts.releaseThresholds,
    runs: [...(evolution.gate?.candidateRuns ?? []), ...(evolution.gate?.baselineRuns ?? [])]
  });

  const outcomeWith = (pullRequest) => ({
    actedOnCluster: actedOn,
    evolution,
    release,
    gate,
    promotedPrompt: promoted,
    pullRequest
  });

  // Early exits, checked in priority order.
  let haltDecision = null;
  if (!promotedChanged) {
    haltDecision = "evolve_yielded_no_improvement";
  } else if (release.status === "fail" || (gate && !gate.promote)) {
    haltDecision = "gate_failed";
  } else if (!opts.ship) {
    haltDecision = "proposed_change";
  }
  if (haltDecision !== null) {
    return finish(haltDecision, outcomeWith(null));
  }

  const ship = opts.ship;
  const baselineStr = toPromptString(baseline.payload);
  const promotedStr = toPromptString(promoted);
  const ctx = {
    runId: opts.runId,
    target: opts.target,
    decision: "pr_opened",
    clusters: clusterReport.clusters,
    actedOnCluster: actedOn,
    observedRunCount: observedRuns.length,
    observedFeedbackCount: observedFeedback.length,
    evolution,
    release,
    gate,
    baselinePromptString: baselineStr,
    promotedPromptString: promotedStr
  };
  const renderBody = ship.renderBody ?? defaultRenderBody;
  // Default file rendering: the promoted prompt followed by one newline.
  const renderFile = ship.renderPromptFile ?? ((next, _prev) => `${next}\n`);
  const currentFile = ship.readCurrentPromptFile ? await ship.readCurrentPromptFile() : null;

  const body = renderBody(ctx);
  const contents = renderFile(promotedStr, currentFile);
  const pr = await proposeAutomatedPullRequest(ship.client, {
    repo: ship.repo,
    baseBranch: ship.baseBranch ?? "main",
    branchName: `${ship.branchPrefix.replace(/\/+$/, "")}/${opts.runId}`,
    title: `${opts.target}: production-loop prompt update (${opts.runId})`,
    body,
    reviewers: ship.reviewers,
    labels: ship.labels,
    fileChanges: [
      {
        path: ship.promptFilePath,
        contents,
        rationale: `Auto-improved against cluster "${actedOn.failureClass}" (${actedOn.runCount} prod failures)`
      }
    ],
    dryRun: ship.dryRun
  });

  return finish("pr_opened", outcomeWith(pr));
}
2053
/**
 * Assembles the cycle report returned by the production loop.
 *
 * @param {object} args - Decision, timing and outcome fields plus the
 *   original loop options (`args.opts`).
 * @returns {object} Report whose `finishedAt` is taken from `args.now()` at
 *   call time and whose `cron` falls back to null when not configured.
 */
function finalize(args) {
  const { opts } = args;
  const finishedAt = args.now().toISOString();
  const {
    decision,
    startedAt,
    observedRunCount,
    observedFeedbackCount,
    clusters,
    actedOnCluster,
    evolution,
    release,
    gate,
    promotedPrompt,
    pullRequest
  } = args;
  return {
    runId: opts.runId,
    target: opts.target,
    decision,
    startedAt,
    finishedAt,
    observedRunCount,
    observedFeedbackCount,
    clusters,
    actedOnCluster,
    evolution,
    release,
    gate,
    baselinePrompt: opts.evolve.baselinePrompt,
    promotedPrompt,
    pullRequest,
    cron: opts.cron ?? null
  };
}
2073
/**
 * Validates production-loop options before any store is read.
 *
 * @param {object} opts - Options passed to `runProductionLoop`.
 * @throws {ValidationError} When `runId` or `target` is blank, `cluster` or
 *   `evolve.gate` is missing, `evolve.holdoutScenarios` is empty,
 *   `evolve.searchScenarios` is present but empty, or a `ship` block lacks
 *   `branchPrefix` / `promptFilePath`.
 */
function validate2(opts) {
  if (!opts.runId.trim()) throw new ValidationError("runProductionLoop: runId required");
  if (!opts.target.trim()) throw new ValidationError("runProductionLoop: target required");
  // The loop reads opts.cluster.* unconditionally, so a missing block is
  // rejected here instead of crashing after the stores have been queried.
  if (!opts.cluster) throw new ValidationError("runProductionLoop: cluster required");
  if (opts.evolve.holdoutScenarios.length === 0) {
    throw new ValidationError("runProductionLoop: evolve.holdoutScenarios must not be empty");
  }
  if (opts.evolve.searchScenarios && opts.evolve.searchScenarios.length === 0) {
    throw new ValidationError(
      "runProductionLoop: evolve.searchScenarios must be omitted or non-empty"
    );
  }
  // This replaces an empty conditional whose only effect was an implicit
  // TypeError when evolve.gate was absent; the requirement is now explicit.
  if (!opts.evolve.gate) {
    throw new ValidationError("runProductionLoop: evolve.gate required");
  }
  if (opts.ship) {
    if (!opts.ship.branchPrefix.trim()) {
      throw new ValidationError("runProductionLoop: ship.branchPrefix required");
    }
    if (!opts.ship.promptFilePath.trim()) {
      throw new ValidationError("runProductionLoop: ship.promptFilePath required");
    }
  }
}
2095
/**
 * Removes duplicate ids while keeping the first occurrence of each, in the
 * original order.
 *
 * @param {Iterable<*>} ids
 * @returns {Array<*>} A new array of distinct ids.
 */
function uniqueIds(ids) {
  // A Set iterates in insertion order, so first occurrences keep their place.
  return Array.from(new Set(ids));
}
2105
/**
 * Derives search scenarios from the holdout set when none were supplied.
 *
 * With fewer than four holdout scenarios only the first one is used;
 * otherwise every fourth scenario (indices 0, 4, 8, ...) is taken. Each
 * derived scenario is a shallow copy whose id gains a "__search" suffix.
 *
 * @param {Array<{id: string}>} holdout
 * @returns {Array<object>} Copies of the selected scenarios with suffixed ids.
 */
function deriveSearchScenarios(holdout) {
  const toSearchCopy = (scenario) => ({ ...scenario, id: `${scenario.id}__search` });
  if (holdout.length < 4) {
    return [toSearchCopy(holdout[0])];
  }
  const derived = [];
  holdout.forEach((scenario, index) => {
    if (index % 4 === 0) derived.push(toSearchCopy(scenario));
  });
  return derived;
}
2116
/**
 * Builds a placeholder run record for a trial when the caller supplies no
 * run-record mapper.
 *
 * Hash and commit fields are all-zero strings and token usage is zero. The
 * trial score is stored under `holdoutScore` for the "holdout" split and
 * under `searchScore` for any other split.
 *
 * @param {{runId: string, target: string, variant: {id: string}, seed: *, split: string, scenarioId: string, trial: object}} input
 * @returns {object} The synthetic run record.
 */
function syntheticRunRecord(input) {
  const { trial, split } = input;
  const zeroHash = "0".repeat(64);
  const outcome = { raw: { score: trial.score, ok: trial.ok ? 1 : 0 } };
  if (split === "holdout") {
    outcome.holdoutScore = trial.score;
  } else {
    outcome.searchScore = trial.score;
  }
  return {
    runId: input.runId,
    experimentId: input.target,
    candidateId: input.variant.id,
    seed: input.seed,
    model: "production-loop@synthetic",
    promptHash: zeroHash,
    configHash: zeroHash,
    commitSha: "0".repeat(40),
    wallMs: trial.durationMs ?? 1,
    costUsd: trial.cost ?? 0,
    tokenUsage: { input: 0, output: 0 },
    outcome,
    splitTag: split,
    scenarioId: input.scenarioId
  };
}
2138
/**
 * Converts a prompt payload into text.
 *
 * Strings are returned unchanged, null/undefined become "", and other
 * values are pretty-printed as JSON. Values JSON cannot represent fall back
 * to `String(payload)`: both those that make `JSON.stringify` throw (cycles,
 * BigInt) and those for which it returns `undefined` (functions, symbols).
 *
 * @param {*} payload
 * @returns {string} Always a string, so callers may safely split or diff it.
 */
function toPromptString(payload) {
  if (typeof payload === "string") return payload;
  if (payload == null) return "";
  try {
    const json = JSON.stringify(payload, null, 2);
    // JSON.stringify yields undefined (not a string) for functions and symbols.
    if (typeof json === "string") return json;
  } catch {
    // Not serializable; use the generic string conversion below.
  }
  return String(payload);
}
2147
/**
 * Renders the default Markdown pull-request body for a production-loop run.
 *
 * Sections, in order: header and observation counts, the triggering failure
 * cluster (if any), the held-out promotion gate (if any), release
 * confidence (if any), and a fenced diff between the baseline and promoted
 * prompts.
 *
 * @param {object} ctx - Render context; reads `target`, `runId`, `decision`,
 *   the observed counts, `actedOnCluster`, `gate`, `release` and the two
 *   prompt strings.
 * @returns {string} Markdown text with lines joined by "\n".
 */
function defaultRenderBody(ctx) {
  const { actedOnCluster: cluster, release, gate } = ctx;
  const fixed4 = (value) => value.toFixed(4);
  const out = [
    `## Production-loop prompt update \u2014 \`${ctx.target}\``,
    "",
    `Run id: \`${ctx.runId}\``,
    `Decision: \`${ctx.decision}\``,
    `Observed in this cycle: ${ctx.observedRunCount} prod runs, ${ctx.observedFeedbackCount} feedback trajectories.`,
    ""
  ];

  if (cluster) {
    out.push(
      "### Triggering failure cluster",
      "",
      `- **class**: \`${cluster.failureClass}\``,
      `- **runs in cluster**: ${cluster.runCount}`,
      `- **distinct scenarios**: ${cluster.scenarioIds.length}`
    );
    if (cluster.toolName) out.push(`- **tool**: \`${cluster.toolName}\``);
    if (cluster.dimension) out.push(`- **judge dimension**: \`${cluster.dimension}\``);
    if (cluster.exampleError) {
      // Keep the sample on one line and cap it at 200 characters.
      const sample = cluster.exampleError.slice(0, 200).replace(/\n/g, " ");
      out.push(`- **example error**: \`${sample}\``);
    }
    out.push("");
  }

  if (gate) {
    const evidence = gate.evidence;
    out.push(
      "### Held-out promotion gate",
      "",
      `- **decision**: \`${gate.promote ? "PROMOTE" : "REJECT"}\``,
      `- **paired median delta**: ${fixed4(evidence.medianPairedDelta)}`,
      `- **paired 95% CI**: [${fixed4(evidence.pairedCI.low)}, ${fixed4(evidence.pairedCI.high)}]`,
      `- **paired p-value**: ${fixed4(evidence.pairedPValue)}`,
      `- **search/holdout means**: ${fixed4(evidence.searchScore)} / ${fixed4(evidence.holdoutScore)}`,
      `- **overfit gap**: ${fixed4(evidence.overfitGap)}`,
      ""
    );
  }

  if (release) {
    out.push(
      "### Release confidence",
      "",
      `- **status**: \`${release.status}\``,
      `- **pass rate**: ${fixed4(release.metrics.passRate)}`,
      `- **mean score**: ${fixed4(release.metrics.meanScore)}`
    );
    if (release.issues.length > 0) {
      out.push("- **issues**:");
      release.issues.forEach((issue) => {
        out.push(` - \`${issue.severity}\` ${issue.axis}: ${issue.detail}`);
      });
    }
    out.push("");
  }

  out.push(
    "### Prompt diff",
    "",
    "```diff",
    unifiedDiff(ctx.baselinePromptString, ctx.promotedPromptString),
    "```"
  );
  return out.join("\n");
}
2211
/**
 * Produces a minimal line diff between two texts.
 *
 * Removed lines are prefixed with "- " and added lines with "+ "; unchanged
 * lines are omitted. Lines are aligned with a longest-common-subsequence
 * match, so inserting or deleting one line does not mark every following
 * line as changed (which a position-by-position comparison would do).
 *
 * @param {string} a - Original text.
 * @param {string} b - Updated text.
 * @returns {string} Diff lines joined by "\n"; "" when the texts are equal.
 */
function unifiedDiff(a, b) {
  const aLines = a.split("\n");
  const bLines = b.split("\n");

  // Strip the common prefix and suffix first; the quadratic table below is
  // then built only for the region that actually differs.
  let start = 0;
  const shortest = Math.min(aLines.length, bLines.length);
  while (start < shortest && aLines[start] === bLines[start]) start++;
  let aEnd = aLines.length;
  let bEnd = bLines.length;
  while (aEnd > start && bEnd > start && aLines[aEnd - 1] === bLines[bEnd - 1]) {
    aEnd--;
    bEnd--;
  }
  const removedSide = aLines.slice(start, aEnd);
  const addedSide = bLines.slice(start, bEnd);
  const rows = removedSide.length;
  const cols = addedSide.length;

  // lcs[i][j] = length of the longest common subsequence of
  // removedSide[i..] and addedSide[j..].
  const lcs = Array.from({ length: rows + 1 }, () => new Uint32Array(cols + 1));
  for (let i = rows - 1; i >= 0; i--) {
    for (let j = cols - 1; j >= 0; j--) {
      lcs[i][j] = removedSide[i] === addedSide[j]
        ? lcs[i + 1][j + 1] + 1
        : Math.max(lcs[i + 1][j], lcs[i][j + 1]);
    }
  }

  const out = [];
  let i = 0;
  let j = 0;
  while (i < rows && j < cols) {
    if (removedSide[i] === addedSide[j]) {
      i++;
      j++;
    } else if (lcs[i + 1][j] >= lcs[i][j + 1]) {
      out.push(`- ${removedSide[i++]}`);
    } else {
      out.push(`+ ${addedSide[j++]}`);
    }
  }
  while (i < rows) out.push(`- ${removedSide[i++]}`);
  while (j < cols) out.push(`+ ${addedSide[j++]}`);
  return out.join("\n");
}
2225
+
1537
2226
  // src/registry.ts
1538
2227
  var ScenarioRegistry = class {
1539
2228
  scenarios = [];
@@ -8417,6 +9106,7 @@ export {
8417
9106
  formatDriverReport,
8418
9107
  formatFindings,
8419
9108
  gainHistogram,
9109
+ ghCliClient,
8420
9110
  precision as goldenPrecision,
8421
9111
  gradeSemanticStatus,
8422
9112
  groupBy,
@@ -8424,6 +9114,7 @@ export {
8424
9114
  hashJson,
8425
9115
  hashScenarios,
8426
9116
  htmlContainsElement,
9117
+ httpGithubClient,
8427
9118
  inMemoryReferenceReplayStore,
8428
9119
  inMemoryReviewStore,
8429
9120
  integrationAsi,
@@ -8484,6 +9175,7 @@ export {
8484
9175
  printDriverSummary,
8485
9176
  probeLlm,
8486
9177
  promptBisect,
9178
+ proposeAutomatedPullRequest,
8487
9179
  proposeSynthesisTargets,
8488
9180
  providerFromBaseUrl,
8489
9181
  pytestTestParser,
@@ -8528,6 +9220,7 @@ export {
8528
9220
  runKeywordCoverageJudgeUrl,
8529
9221
  runLiveProof,
8530
9222
  runMultiShotOptimization,
9223
+ runProductionLoop,
8531
9224
  runPromptEvolution,
8532
9225
  runProposeReview,
8533
9226
  runProposeReviewAsControlLoop,