@tangle-network/agent-eval 0.41.0 → 0.43.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/dist/benchmarks/index.js +2 -2
  2. package/dist/builder-eval/index.js +1 -1
  3. package/dist/campaign/index.d.ts +90 -368
  4. package/dist/campaign/index.js +74 -4
  5. package/dist/campaign/index.js.map +1 -1
  6. package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
  7. package/dist/chunk-H4TOS272.js.map +1 -0
  8. package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
  9. package/dist/chunk-KQ26DYTQ.js.map +1 -0
  10. package/dist/{chunk-6QDKWHLS.js → chunk-MHQPVHXU.js} +2 -2
  11. package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
  12. package/dist/chunk-MNL6LXGQ.js.map +1 -0
  13. package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
  14. package/dist/chunk-N4SBKEPJ.js.map +1 -0
  15. package/dist/chunk-NSBPE2FW.js +17 -0
  16. package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
  17. package/dist/{chunk-YNMCYUWT.js → chunk-RXK7FXLV.js} +92 -37
  18. package/dist/chunk-RXK7FXLV.js.map +1 -0
  19. package/dist/cli.js +1 -1
  20. package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
  21. package/dist/control.d.ts +2 -2
  22. package/dist/control.js +1 -1
  23. package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
  24. package/dist/governance/index.js +1 -1
  25. package/dist/index.d.ts +227 -687
  26. package/dist/index.js +755 -1239
  27. package/dist/index.js.map +1 -1
  28. package/dist/integrity-CTDhR1Sg.d.ts +81 -0
  29. package/dist/knowledge/index.js +1 -1
  30. package/dist/llm-client-BXVRUZyX.d.ts +234 -0
  31. package/dist/matrix/index.js +1 -1
  32. package/dist/meta-eval/index.js +1 -1
  33. package/dist/multishot/index.js +1 -1
  34. package/dist/openapi.json +1 -1
  35. package/dist/pipelines/index.js +68 -4
  36. package/dist/pipelines/index.js.map +1 -1
  37. package/dist/prm/index.js +1 -1
  38. package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
  39. package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
  40. package/dist/reporting.d.ts +2 -3
  41. package/dist/reporting.js +5 -9
  42. package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
  43. package/dist/rl.d.ts +103 -221
  44. package/dist/rl.js +45 -200
  45. package/dist/rl.js.map +1 -1
  46. package/dist/{run-campaign-KEJK5KFT.js → run-campaign-GNDO66B4.js} +3 -3
  47. package/dist/sequential-DdV5ShjT.d.ts +561 -0
  48. package/dist/telemetry/file.js +1 -1
  49. package/dist/telemetry/index.js +1 -1
  50. package/dist/traces.d.ts +3 -2
  51. package/dist/traces.js +6 -6
  52. package/dist/types-BLbRTxoc.d.ts +367 -0
  53. package/dist/wire/index.d.ts +1 -1
  54. package/dist/wire/index.js +1 -1
  55. package/package.json +26 -17
  56. package/dist/chunk-5U2DOJU4.js.map +0 -1
  57. package/dist/chunk-AU2JLNSZ.js.map +0 -1
  58. package/dist/chunk-DMW5VENN.js +0 -1412
  59. package/dist/chunk-DMW5VENN.js.map +0 -1
  60. package/dist/chunk-EGIPWXHL.js.map +0 -1
  61. package/dist/chunk-MAZ26DC7.js +0 -99
  62. package/dist/chunk-MAZ26DC7.js.map +0 -1
  63. package/dist/chunk-NKLGKF2Q.js.map +0 -1
  64. package/dist/chunk-PZ5AY32C.js +0 -10
  65. package/dist/chunk-YNMCYUWT.js.map +0 -1
  66. package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
  67. package/dist/optimization.d.ts +0 -11
  68. package/dist/optimization.js +0 -71
  69. package/dist/run-campaign-KEJK5KFT.js.map +0 -1
  70. package/dist/sequential-5iSVfzl2.d.ts +0 -139
  71. package/dist/summary-report-DuZXOk7K.d.ts +0 -917
  72. /package/dist/{chunk-6QDKWHLS.js.map → chunk-MHQPVHXU.js.map} +0 -0
  73. /package/dist/{chunk-PZ5AY32C.js.map → chunk-NSBPE2FW.js.map} +0 -0
  74. /package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0
  75. /package/dist/{optimization.js.map → run-campaign-GNDO66B4.js.map} +0 -0
package/dist/index.js CHANGED
@@ -1,28 +1,30 @@
1
1
  import {
2
+ DEFAULT_MUTATION_PRIMITIVES,
2
3
  DEFAULT_RED_TEAM_CORPUS,
3
4
  Dataset,
4
5
  HoldoutLockedError,
6
+ buildReflectionPrompt,
5
7
  hashScenarios,
8
+ parseReflectionResponse,
6
9
  redTeamDataset,
7
10
  redTeamReport,
8
11
  runCanaries,
9
12
  scoreRedTeamOutput,
10
13
  toolNamesForRun
11
- } from "./chunk-5U2DOJU4.js";
14
+ } from "./chunk-N4SBKEPJ.js";
12
15
  import {
13
16
  BENCHMARK_SPLIT_SEED,
14
17
  benchmarks_exports,
15
18
  deterministicSplit
16
- } from "./chunk-6QDKWHLS.js";
19
+ } from "./chunk-MHQPVHXU.js";
17
20
  import {
18
21
  DEFAULT_RULES,
19
22
  classifyFailure,
20
23
  compareToBaseline,
21
24
  computeToolUseMetrics,
22
- failureClusterView,
23
25
  iqr,
24
26
  welchsTTest
25
- } from "./chunk-AU2JLNSZ.js";
27
+ } from "./chunk-H4TOS272.js";
26
28
  import {
27
29
  exportTrainingData,
28
30
  toNdjson
@@ -74,63 +76,16 @@ import {
74
76
  stopOnRepeatedAction,
75
77
  subjectiveEval
76
78
  } from "./chunk-NCRFYPS3.js";
77
- import {
78
- CallbackResearcher,
79
- DEFAULT_MUTATION_PRIMITIVES,
80
- FileSystemFeedbackTrajectoryStore,
81
- HeldOutGate,
82
- InMemoryFeedbackTrajectoryStore,
83
- InMemoryTrialCache,
84
- NoopResearcher,
85
- assignFeedbackSplit,
86
- buildReflectionPrompt,
87
- controlRunToFeedbackTrajectory,
88
- createFeedbackTrajectory,
89
- crowdingDistance,
90
- defaultMultiShotObjectives,
91
- dominates,
92
- feedbackTrajectoriesToDatasetScenarios,
93
- feedbackTrajectoriesToOptimizerRows,
94
- feedbackTrajectoryToDatasetScenario,
95
- feedbackTrajectoryToOptimizerRow,
96
- paretoFrontier,
97
- paretoFrontierWithCrowding,
98
- parseFeedbackTrajectoriesJsonl,
99
- parseReflectionResponse,
100
- renderPreferenceMemoryMarkdown,
101
- replayFeedbackTrajectories,
102
- replayFeedbackTrajectory,
103
- runMultiShotOptimization,
104
- runPromptEvolution,
105
- scalarScore,
106
- serializeFeedbackTrajectoriesJsonl,
107
- summarizePreferenceMemory,
108
- trialTraceFromMultiShotTrial,
109
- withAssignedFeedbackSplit
110
- } from "./chunk-DMW5VENN.js";
111
79
  import {
112
80
  assertReleaseConfidence,
113
81
  bootstrapCi,
114
82
  evaluateReleaseConfidence,
115
83
  judgeReplayGate,
116
- releaseTraceEvidenceFromMultiShotTrials,
117
84
  renderReleaseReport
118
- } from "./chunk-NKLGKF2Q.js";
85
+ } from "./chunk-KQ26DYTQ.js";
119
86
  import {
120
87
  runEvalCampaign
121
- } from "./chunk-LCIDRYGP.js";
122
- import {
123
- LlmCallError,
124
- LlmClient,
125
- LlmRouteAssertionError,
126
- assertLlmRoute,
127
- backoffMs,
128
- callLlm,
129
- callLlmJson,
130
- isTransientLlmError,
131
- probeLlm,
132
- stripFencedJson
133
- } from "./chunk-VXNVVBZO.js";
88
+ } from "./chunk-PD3MH6WU.js";
134
89
  import {
135
90
  AGENT_PROFILE_KINDS,
136
91
  AgentProfileCellValidationError,
@@ -150,17 +105,15 @@ import {
150
105
  validateRunRecord,
151
106
  verifyAgentProfileCell
152
107
  } from "./chunk-BWZEGTES.js";
153
- import {
154
- evaluateInterimReleaseConfidence,
155
- pairedEvalueSequence
156
- } from "./chunk-MAZ26DC7.js";
157
108
  import {
158
109
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
110
+ evaluateInterimReleaseConfidence,
159
111
  gainHistogram,
112
+ pairedEvalueSequence,
160
113
  paretoChart,
161
114
  researchReport,
162
115
  summaryTable
163
- } from "./chunk-EGIPWXHL.js";
116
+ } from "./chunk-MNL6LXGQ.js";
164
117
  import {
165
118
  benjaminiHochberg,
166
119
  bonferroni,
@@ -250,13 +203,6 @@ import {
250
203
  assertRunCaptured,
251
204
  throwIfRunIncomplete
252
205
  } from "./chunk-UBPIXOC4.js";
253
- import {
254
- FileSystemRawProviderSink,
255
- InMemoryRawProviderSink,
256
- NoopRawProviderSink,
257
- defaultProviderRedactor,
258
- providerFromBaseUrl
259
- } from "./chunk-PC4UYEBM.js";
260
206
  import {
261
207
  TraceEmitter,
262
208
  llmSpanFromProvider
@@ -268,6 +214,25 @@ import {
268
214
  signManifest,
269
215
  verifyManifest
270
216
  } from "./chunk-VSMTAMNK.js";
217
+ import {
218
+ LlmCallError,
219
+ LlmClient,
220
+ LlmRouteAssertionError,
221
+ assertLlmRoute,
222
+ backoffMs,
223
+ callLlm,
224
+ callLlmJson,
225
+ isTransientLlmError,
226
+ probeLlm,
227
+ stripFencedJson
228
+ } from "./chunk-VXNVVBZO.js";
229
+ import {
230
+ FileSystemRawProviderSink,
231
+ InMemoryRawProviderSink,
232
+ NoopRawProviderSink,
233
+ defaultProviderRedactor,
234
+ providerFromBaseUrl
235
+ } from "./chunk-PC4UYEBM.js";
271
236
  import {
272
237
  AgentEvalError,
273
238
  CaptureIntegrityError,
@@ -278,7 +243,7 @@ import {
278
243
  ValidationError,
279
244
  VerificationError
280
245
  } from "./chunk-QYJT52YW.js";
281
- import "./chunk-PZ5AY32C.js";
246
+ import "./chunk-NSBPE2FW.js";
282
247
 
283
248
  // src/run-score.ts
284
249
  var DEFAULT_RUN_SCORE_WEIGHTS = {
@@ -853,8 +818,8 @@ function createJudgeAdapter(opts) {
853
818
  cost: opts.cost ?? { kind: "llm" },
854
819
  version: `judge-${ADAPTER_REV}`,
855
820
  async analyze(input) {
856
- const scores = await opts.judge(opts.tcloud, input);
857
- return scores.filter((s) => normalize10(s.score) < threshold).map((s) => liftJudgeScore(id, area, s));
821
+ const scores2 = await opts.judge(opts.tcloud, input);
822
+ return scores2.filter((s) => normalize10(s.score) < threshold).map((s) => liftJudgeScore(id, area, s));
858
823
  }
859
824
  };
860
825
  }
@@ -2203,10 +2168,10 @@ function ghCliClient(opts = {}) {
2203
2168
  await exec("git", ["branch", "-D", input.branchName], { cwd });
2204
2169
  await run("git", ["checkout", "-b", input.branchName]);
2205
2170
  const { mkdir, writeFile } = await import("fs/promises");
2206
- const { dirname: dirname6, join: join4, resolve } = await import("path");
2171
+ const { dirname: dirname4, join: join4, resolve } = await import("path");
2207
2172
  for (const change of input.fileChanges) {
2208
2173
  const abs = resolve(cwd, change.path);
2209
- await mkdir(dirname6(abs), { recursive: true });
2174
+ await mkdir(dirname4(abs), { recursive: true });
2210
2175
  await writeFile(abs, change.contents, "utf8");
2211
2176
  await run("git", ["add", join4(change.path)]);
2212
2177
  }
@@ -2404,8 +2369,8 @@ async function executeScenario(tc, scenario, config) {
2404
2369
  console.log(` judge retry ${attempt}/2 (waiting ${wait / 1e3}s)`);
2405
2370
  await new Promise((r) => setTimeout(r, wait));
2406
2371
  }
2407
- const scores = await judge(tc, judgeInput);
2408
- judgeResults.push(scores);
2372
+ const scores2 = await judge(tc, judgeInput);
2373
+ judgeResults.push(scores2);
2409
2374
  await new Promise((r) => setTimeout(r, 3e3));
2410
2375
  break;
2411
2376
  } catch (err) {
@@ -3118,200 +3083,400 @@ ${lastResponse}` : "No conversation yet. Send your opening message \u2014 in cha
3118
3083
  return content.trim();
3119
3084
  }
3120
3085
 
3121
- // src/integration-gates.ts
3122
- function integrationManifestValidatedPayload(input) {
3086
+ // src/feedback-trajectory.ts
3087
+ var DEFAULT_SPLIT_POLICY = {
3088
+ trainPct: 70,
3089
+ devPct: 15,
3090
+ testPct: 10,
3091
+ holdoutPct: 5
3092
+ };
3093
+ var InMemoryFeedbackTrajectoryStore = class {
3094
+ trajectories = /* @__PURE__ */ new Map();
3095
+ async save(trajectory) {
3096
+ this.trajectories.set(trajectory.id, cloneTrajectory(trajectory));
3097
+ }
3098
+ async get(id) {
3099
+ const trajectory = this.trajectories.get(id);
3100
+ return trajectory ? cloneTrajectory(trajectory) : null;
3101
+ }
3102
+ async list(filter = {}) {
3103
+ return [...this.trajectories.values()].filter((trajectory) => matchesFilter(trajectory, filter)).map(cloneTrajectory);
3104
+ }
3105
+ async appendAttempt(id, attempt) {
3106
+ const trajectory = this.trajectories.get(id);
3107
+ if (!trajectory)
3108
+ throw new Error(`FeedbackTrajectoryStore.appendAttempt: unknown trajectory "${id}"`);
3109
+ const next = cloneTrajectory({
3110
+ ...trajectory,
3111
+ attempts: [...trajectory.attempts, attempt],
3112
+ updatedAt: attempt.createdAt
3113
+ });
3114
+ this.trajectories.set(id, next);
3115
+ return cloneTrajectory(next);
3116
+ }
3117
+ async appendLabel(id, label, attemptId) {
3118
+ const trajectory = this.trajectories.get(id);
3119
+ if (!trajectory)
3120
+ throw new Error(`FeedbackTrajectoryStore.appendLabel: unknown trajectory "${id}"`);
3121
+ const attempts = attemptId ? trajectory.attempts.map(
3122
+ (attempt) => attempt.id === attemptId ? { ...attempt, feedback: [...attempt.feedback ?? [], label] } : attempt
3123
+ ) : trajectory.attempts;
3124
+ const next = cloneTrajectory({
3125
+ ...trajectory,
3126
+ attempts,
3127
+ labels: attemptId ? trajectory.labels : [...trajectory.labels, label],
3128
+ updatedAt: label.createdAt
3129
+ });
3130
+ this.trajectories.set(id, next);
3131
+ return cloneTrajectory(next);
3132
+ }
3133
+ };
3134
+ var FileSystemFeedbackTrajectoryStore = class {
3135
+ dir;
3136
+ memory = new InMemoryFeedbackTrajectoryStore();
3137
+ loaded = false;
3138
+ constructor(options) {
3139
+ this.dir = options.dir;
3140
+ }
3141
+ async save(trajectory) {
3142
+ await this.load();
3143
+ await this.memory.save(trajectory);
3144
+ await this.append({ op: "save", trajectory });
3145
+ }
3146
+ async get(id) {
3147
+ await this.load();
3148
+ return this.memory.get(id);
3149
+ }
3150
+ async list(filter = {}) {
3151
+ await this.load();
3152
+ return this.memory.list(filter);
3153
+ }
3154
+ async appendAttempt(id, attempt) {
3155
+ await this.load();
3156
+ const next = await this.memory.appendAttempt(id, attempt);
3157
+ await this.append({ op: "appendAttempt", id, attempt });
3158
+ return next;
3159
+ }
3160
+ async appendLabel(id, label, attemptId) {
3161
+ await this.load();
3162
+ const next = await this.memory.appendLabel(id, label, attemptId);
3163
+ await this.append({ op: "appendLabel", id, label, attemptId });
3164
+ return next;
3165
+ }
3166
+ async append(record) {
3167
+ const { appendFile, mkdir } = await import("fs/promises");
3168
+ const { join: join4 } = await import("path");
3169
+ await mkdir(this.dir, { recursive: true });
3170
+ await appendFile(
3171
+ join4(this.dir, "feedback-trajectories.ndjson"),
3172
+ `${JSON.stringify(record)}
3173
+ `,
3174
+ "utf8"
3175
+ );
3176
+ }
3177
+ async load() {
3178
+ if (this.loaded) return;
3179
+ const { readFile } = await import("fs/promises");
3180
+ const { join: join4 } = await import("path");
3181
+ const file = join4(this.dir, "feedback-trajectories.ndjson");
3182
+ try {
3183
+ const raw = await readFile(file, "utf8");
3184
+ for (const line of raw.split("\n")) {
3185
+ if (!line.trim()) continue;
3186
+ try {
3187
+ const record = JSON.parse(line);
3188
+ if (record.op === "save") await this.memory.save(record.trajectory);
3189
+ if (record.op === "appendAttempt")
3190
+ await this.memory.appendAttempt(record.id, record.attempt);
3191
+ if (record.op === "appendLabel")
3192
+ await this.memory.appendLabel(record.id, record.label, record.attemptId);
3193
+ } catch {
3194
+ }
3195
+ }
3196
+ } catch {
3197
+ }
3198
+ this.loaded = true;
3199
+ }
3200
+ };
3201
+ function createFeedbackTrajectory(input) {
3202
+ const createdAt = input.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
3203
+ const id = input.id ?? `ft_${stableHash(`${input.projectId ?? ""}|${input.scenarioId ?? ""}|${input.task.intent}|${createdAt}`).toString(16)}`;
3123
3204
  return {
3124
- kind: "integration_manifest_validated",
3125
- connectorId: input.connectorId,
3126
- ...input.actionId ? { actionId: input.actionId } : {},
3127
- valid: input.valid,
3128
- ...input.reason ? { reason: input.reason } : {},
3129
- ...input.metadata ? { metadata: input.metadata } : {}
3205
+ id,
3206
+ projectId: input.projectId,
3207
+ scenarioId: input.scenarioId,
3208
+ task: input.task,
3209
+ attempts: input.attempts ?? [],
3210
+ labels: input.labels ?? [],
3211
+ outcome: input.outcome,
3212
+ split: input.split,
3213
+ tags: input.tags,
3214
+ createdAt,
3215
+ metadata: input.metadata
3130
3216
  };
3131
3217
  }
3132
- function integrationManifestResolvedPayload(input) {
3133
- const missingConnections = input.missingConnections ?? [];
3134
- const missingScopes = input.missingScopes ?? [];
3135
- const requiredScopes = input.requiredScopes ?? [];
3136
- const status = input.status ?? statusForManifest(input);
3218
+ function assignFeedbackSplit(trajectory, policy = {}) {
3219
+ const split = { ...DEFAULT_SPLIT_POLICY, ...policy };
3220
+ const total = split.trainPct + split.devPct + split.testPct + split.holdoutPct;
3221
+ if (total <= 0) throw new Error("assignFeedbackSplit: split percentages must sum above zero");
3222
+ const bucket = stableHash(
3223
+ `${trajectory.projectId ?? ""}|${trajectory.scenarioId ?? ""}|${trajectory.id}|${trajectory.task.intent}`
3224
+ ) % total;
3225
+ if (bucket < split.trainPct) return "train";
3226
+ if (bucket < split.trainPct + split.devPct) return "dev";
3227
+ if (bucket < split.trainPct + split.devPct + split.testPct) return "test";
3228
+ return "holdout";
3229
+ }
3230
+ function withAssignedFeedbackSplit(trajectory, policy) {
3137
3231
  return {
3138
- kind: "integration_manifest_resolved",
3139
- connectorId: input.connectorId,
3140
- ...input.actionId ? { actionId: input.actionId } : {},
3141
- status,
3142
- missingConnections,
3143
- missingScopes,
3144
- requiredScopes,
3145
- missing: resolutionMissingItems(input, missingConnections, missingScopes, requiredScopes),
3146
- optionalMissing: [],
3147
- ready: status === "ready" ? [
3148
- {
3149
- status: "ready",
3150
- connectorId: input.connectorId,
3151
- ...input.actionId ? { actionId: input.actionId } : {},
3152
- requiredScopes
3153
- }
3154
- ] : [],
3155
- approvalRequired: input.approvalRequired ?? false,
3156
- ...input.reason ? { reason: input.reason } : {},
3157
- ...input.metadata ? { metadata: input.metadata } : {}
3232
+ ...trajectory,
3233
+ split: trajectory.split ?? assignFeedbackSplit(trajectory, policy)
3158
3234
  };
3159
3235
  }
3160
- function integrationInvokeFailedPayload(input) {
3236
+ function feedbackTrajectoryToDatasetScenario(trajectory) {
3237
+ const withSplit = withAssignedFeedbackSplit(trajectory);
3161
3238
  return {
3162
- kind: "integration_invoke_failed",
3163
- connectorId: input.connectorId,
3164
- actionId: input.actionId,
3165
- code: input.code,
3166
- message: input.message,
3167
- ...input.status !== void 0 ? { status: input.status } : {},
3168
- ...input.retryable !== void 0 ? { retryable: input.retryable } : {},
3169
- ...input.metadata ? { metadata: input.metadata } : {}
3239
+ id: withSplit.scenarioId ?? withSplit.id,
3240
+ split: withSplit.split,
3241
+ payload: withSplit,
3242
+ tags: {
3243
+ ...withSplit.projectId ? { projectId: withSplit.projectId } : {},
3244
+ ...withSplit.tags ?? {},
3245
+ source: "feedback-trajectory"
3246
+ }
3170
3247
  };
3171
3248
  }
3172
- function integrationGateEvals(input) {
3173
- const evals = [];
3174
- evals.push(
3175
- objectiveEval({
3176
- id: `integration-manifest-valid:${input.connectorId}${input.actionId ? `:${input.actionId}` : ""}`,
3177
- passed: input.valid,
3178
- score: input.valid ? 1 : 0,
3179
- severity: input.valid ? "info" : "critical",
3180
- detail: input.valid ? "Integration manifest is valid." : input.reason ?? "Integration manifest is invalid.",
3181
- metadata: { integration: input }
3182
- })
3183
- );
3184
- const missingConnections = input.missingConnections ?? [];
3185
- evals.push(
3186
- objectiveEval({
3187
- id: `integration-connection-ready:${input.connectorId}`,
3188
- passed: missingConnections.length === 0,
3189
- score: missingConnections.length === 0 ? 1 : 0,
3190
- severity: missingConnections.length === 0 ? "info" : "critical",
3191
- detail: missingConnections.length === 0 ? "Required integration connections are present." : `Missing integration connection(s): ${missingConnections.join(", ")}`,
3192
- evidence: missingConnections.join(", ") || void 0,
3193
- metadata: { connectorId: input.connectorId, missingConnections }
3194
- })
3195
- );
3196
- const missingScopes = input.missingScopes ?? [];
3197
- evals.push(
3198
- objectiveEval({
3199
- id: `integration-scopes-ready:${input.connectorId}`,
3200
- passed: missingScopes.length === 0,
3201
- score: missingScopes.length === 0 ? 1 : 0,
3202
- severity: missingScopes.length === 0 ? "info" : "critical",
3203
- detail: missingScopes.length === 0 ? "Required integration scopes are granted." : `Missing integration scope(s): ${missingScopes.join(", ")}`,
3204
- evidence: missingScopes.join(", ") || void 0,
3205
- metadata: {
3206
- connectorId: input.connectorId,
3207
- missingScopes,
3208
- requiredScopes: input.requiredScopes ?? []
3209
- }
3210
- })
3211
- );
3212
- if (input.approvalRequired) {
3213
- evals.push(
3214
- objectiveEval({
3215
- id: `integration-approval-required:${input.connectorId}`,
3216
- passed: false,
3217
- score: 0,
3218
- severity: "warning",
3219
- detail: "Integration action requires approval before execution.",
3220
- metadata: { connectorId: input.connectorId, actionId: input.actionId }
3221
- })
3222
- );
3223
- }
3224
- return evals;
3249
+ function feedbackTrajectoriesToDatasetScenarios(trajectories) {
3250
+ return trajectories.map(feedbackTrajectoryToDatasetScenario);
3251
+ }
3252
+ function feedbackTrajectoryToOptimizerRow(trajectory) {
3253
+ const labels = allLabels(trajectory);
3254
+ return {
3255
+ scenarioId: trajectory.scenarioId ?? trajectory.id,
3256
+ trajectoryId: trajectory.id,
3257
+ labelKinds: [...new Set(labels.map((label) => label.kind))],
3258
+ score: trajectory.outcome?.score ?? scoreFromLabels(labels),
3259
+ metadata: {
3260
+ projectId: trajectory.projectId,
3261
+ split: trajectory.split,
3262
+ intent: trajectory.task.intent,
3263
+ attempts: trajectory.attempts.length,
3264
+ outcome: trajectory.outcome,
3265
+ labels
3266
+ }
3267
+ };
3268
+ }
3269
+ function feedbackTrajectoriesToOptimizerRows(trajectories) {
3270
+ return trajectories.map(feedbackTrajectoryToOptimizerRow);
3225
3271
  }
3226
- function integrationAsi(input) {
3227
- if ("code" in input) {
3272
+ async function replayFeedbackTrajectory(trajectory, adapter) {
3273
+ try {
3274
+ const result = await adapter.replay(trajectory);
3228
3275
  return {
3229
- expectationId: `integration-invoke:${input.connectorId}:${input.actionId}`,
3230
- message: input.message,
3231
- severity: severityForInvokeFailure(input.code),
3232
- responsibleSurface: surfaceForInvokeFailure(input.code),
3233
- suggestion: suggestionForInvokeFailure(input),
3234
- metadata: { integration: input }
3276
+ trajectoryId: trajectory.id,
3277
+ ...result
3278
+ };
3279
+ } catch (err) {
3280
+ const createdAt = (/* @__PURE__ */ new Date()).toISOString();
3281
+ const message = err instanceof Error ? err.message : String(err);
3282
+ return {
3283
+ trajectoryId: trajectory.id,
3284
+ pass: false,
3285
+ labels: [
3286
+ {
3287
+ source: "system",
3288
+ kind: "reject",
3289
+ value: false,
3290
+ reason: message,
3291
+ severity: "error",
3292
+ createdAt
3293
+ }
3294
+ ],
3295
+ outcome: {
3296
+ success: false,
3297
+ score: 0,
3298
+ detail: message,
3299
+ observedAt: createdAt
3300
+ },
3301
+ metadata: { replayError: true }
3235
3302
  };
3236
3303
  }
3237
- const missingConnections = input.missingConnections ?? [];
3238
- const missingScopes = input.missingScopes ?? [];
3239
- const surface = !input.valid ? "integration-manifest" : missingConnections.length > 0 ? "integration-connection" : missingScopes.length > 0 ? "integration-scope" : input.approvalRequired ? "integration-approval" : "integration-policy";
3240
- return {
3241
- expectationId: `integration-ready:${input.connectorId}${input.actionId ? `:${input.actionId}` : ""}`,
3242
- message: input.reason ?? messageForManifest(input),
3243
- severity: input.valid && missingConnections.length === 0 && missingScopes.length === 0 && !input.approvalRequired ? "info" : "error",
3244
- responsibleSurface: surface,
3245
- suggestion: suggestionForManifest(input),
3246
- metadata: { integration: input }
3247
- };
3248
3304
  }
3249
- function statusForManifest(input) {
3250
- if (input.approvalRequired) return "approval_required";
3251
- if (!input.valid || (input.missingConnections?.length ?? 0) > 0 || (input.missingScopes?.length ?? 0) > 0)
3252
- return "blocked";
3253
- return "ready";
3254
- }
3255
- function resolutionMissingItems(input, missingConnections, missingScopes, requiredScopes) {
3256
- const connectionItems = missingConnections.map((connectorId) => ({
3257
- status: "missing_connection",
3258
- connectorId,
3259
- ...input.actionId ? { actionId: input.actionId } : {},
3260
- requiredScopes
3261
- }));
3262
- if (missingScopes.length === 0) return connectionItems;
3263
- return [
3264
- ...connectionItems,
3265
- {
3266
- status: "missing_scope",
3267
- connectorId: input.connectorId,
3268
- ...input.actionId ? { actionId: input.actionId } : {},
3269
- missingScopes,
3270
- requiredScopes
3305
+ async function replayFeedbackTrajectories(trajectories, adapter) {
3306
+ const results = [];
3307
+ for (const trajectory of trajectories) {
3308
+ results.push(await replayFeedbackTrajectory(trajectory, adapter));
3309
+ }
3310
+ return results;
3311
+ }
3312
+ function summarizePreferenceMemory(trajectories, options = {}) {
3313
+ const maxEntries = options.maxEntries ?? 20;
3314
+ const entries = [];
3315
+ for (const trajectory of trajectories) {
3316
+ for (const label of allLabels(trajectory)) {
3317
+ const instruction = instructionFromLabel(trajectory, label);
3318
+ if (!instruction) continue;
3319
+ entries.push({
3320
+ instruction,
3321
+ rationale: label.reason ?? `${label.kind} label from ${label.source}`,
3322
+ weight: weightForLabel(label),
3323
+ sourceTrajectoryId: trajectory.id,
3324
+ sourceLabelId: label.id,
3325
+ category: label.kind
3326
+ });
3327
+ }
3328
+ }
3329
+ const byInstruction = /* @__PURE__ */ new Map();
3330
+ for (const entry of entries) {
3331
+ const key = entry.instruction.toLowerCase().replace(/\s+/g, " ").trim();
3332
+ const existing = byInstruction.get(key);
3333
+ if (!existing || entry.weight > existing.weight) byInstruction.set(key, entry);
3334
+ }
3335
+ return [...byInstruction.values()].sort((a, b) => b.weight - a.weight).slice(0, maxEntries);
3336
+ }
3337
+ function renderPreferenceMemoryMarkdown(entries) {
3338
+ const lines = ["# Preference Memory", ""];
3339
+ for (const entry of entries) {
3340
+ lines.push(`- ${entry.instruction}`);
3341
+ lines.push(` Rationale: ${entry.rationale}`);
3342
+ lines.push(` Source: ${entry.sourceTrajectoryId}`);
3343
+ lines.push("");
3344
+ }
3345
+ return `${lines.join("\n").trim()}
3346
+ `;
3347
+ }
3348
+ function serializeFeedbackTrajectoriesJsonl(trajectories) {
3349
+ return `${trajectories.slice().sort((a, b) => a.id.localeCompare(b.id)).map((trajectory) => JSON.stringify(canonicalize2(trajectory))).join("\n")}
3350
+ `;
3351
+ }
3352
+ function parseFeedbackTrajectoriesJsonl(jsonl) {
3353
+ const trajectories = [];
3354
+ for (const line of jsonl.split("\n")) {
3355
+ if (!line.trim()) continue;
3356
+ trajectories.push(JSON.parse(line));
3357
+ }
3358
+ return trajectories;
3359
+ }
3360
+ function controlRunToFeedbackTrajectory(run, options = {}) {
3361
+ const createdAt = options.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
3362
+ const trajectoryId = run.runId ?? `ft_control_${stableHash(`${run.intent}|${createdAt}`).toString(16)}`;
3363
+ return createFeedbackTrajectory({
3364
+ id: trajectoryId,
3365
+ projectId: options.projectId,
3366
+ scenarioId: options.scenarioId,
3367
+ task: { intent: run.intent },
3368
+ createdAt,
3369
+ attempts: run.steps.map((step) => ({
3370
+ id: `${trajectoryId}_step_${step.index}`,
3371
+ stepIndex: step.index,
3372
+ artifactType: options.artifactType ?? "action",
3373
+ artifact: options.artifactFromStep?.(step) ?? step.actionOutcome?.result ?? step.decision,
3374
+ proposedAction: options.proposedActionFromStep?.(step),
3375
+ evals: step.evalsAfter,
3376
+ createdAt: step.startedAt,
3377
+ metadata: {
3378
+ decision: step.decision,
3379
+ actionOutcome: step.actionOutcome
3380
+ }
3381
+ })),
3382
+ labels: [
3383
+ {
3384
+ source: "system",
3385
+ kind: run.pass ? "approve" : "reject",
3386
+ value: run.pass,
3387
+ reason: run.reason,
3388
+ severity: run.pass ? "info" : "error",
3389
+ createdAt
3390
+ }
3391
+ ],
3392
+ outcome: {
3393
+ success: run.pass,
3394
+ score: run.score,
3395
+ costUsd: run.spentCostUsd,
3396
+ detail: run.reason,
3397
+ observedAt: createdAt,
3398
+ metadata: {
3399
+ stoppedBy: run.stoppedBy,
3400
+ failureClass: run.failureClass
3401
+ }
3271
3402
  }
3403
+ });
3404
+ }
3405
+ function allLabels(trajectory) {
3406
+ const labels = [
3407
+ ...trajectory.labels,
3408
+ ...trajectory.attempts.flatMap((attempt) => attempt.feedback ?? [])
3272
3409
  ];
3410
+ const seen = /* @__PURE__ */ new Set();
3411
+ return labels.filter((label) => {
3412
+ const key = label.id ?? `${label.source}|${label.kind}|${label.createdAt}|${JSON.stringify(label.value)}`;
3413
+ if (seen.has(key)) return false;
3414
+ seen.add(key);
3415
+ return true;
3416
+ });
3417
+ }
3418
+ function scoreFromLabels(labels) {
3419
+ if (!labels.length) return void 0;
3420
+ const scored = labels.map((label) => {
3421
+ if (label.kind === "approve" || label.kind === "select") return 1;
3422
+ if (label.kind === "reject" || label.kind === "policy_block") return 0;
3423
+ if (label.kind === "rate" && typeof label.value === "number")
3424
+ return Math.max(0, Math.min(1, label.value));
3425
+ return void 0;
3426
+ }).filter((value) => typeof value === "number");
3427
+ if (!scored.length) return void 0;
3428
+ return Math.round(scored.reduce((sum3, value) => sum3 + value, 0) / scored.length * 1e3) / 1e3;
3429
+ }
3430
+ function instructionFromLabel(trajectory, label) {
3431
+ if (label.kind === "reject" && label.reason)
3432
+ return `Avoid outputs like "${compact(trajectory.task.intent, 80)}" when: ${label.reason}`;
3433
+ if (label.kind === "revision_request" && label.reason)
3434
+ return `Revise similar work by applying: ${label.reason}`;
3435
+ if (label.kind === "select" && label.reason)
3436
+ return `Prefer selected options for "${compact(trajectory.task.intent, 80)}" because: ${label.reason}`;
3437
+ if (label.kind === "approve" && label.reason)
3438
+ return `Repeat the pattern approved for "${compact(trajectory.task.intent, 80)}": ${label.reason}`;
3439
+ if (label.kind === "comment" && label.reason) return label.reason;
3440
+ return void 0;
3441
+ }
3442
+ function weightForLabel(label) {
3443
+ const severity = label.severity === "critical" ? 4 : label.severity === "error" ? 3 : label.severity === "warning" ? 2 : 1;
3444
+ const source = label.source === "user" ? 3 : label.source === "metric" || label.source === "environment" ? 2 : 1;
3445
+ return severity * source;
3446
+ }
3447
+ function matchesFilter(trajectory, filter) {
3448
+ if (filter.projectId && trajectory.projectId !== filter.projectId) return false;
3449
+ if (filter.scenarioId && trajectory.scenarioId !== filter.scenarioId) return false;
3450
+ if (filter.split && trajectory.split !== filter.split) return false;
3451
+ if (filter.tag) {
3452
+ const [key, value] = filter.tag;
3453
+ if (trajectory.tags?.[key] !== value) return false;
3454
+ }
3455
+ return true;
3456
+ }
3457
+ function cloneTrajectory(trajectory) {
3458
+ return JSON.parse(JSON.stringify(trajectory));
3273
3459
  }
3274
- function surfaceForInvokeFailure(code) {
3275
- if (code === "auth_expired") return "integration-auth";
3276
- if (code === "scope_denied") return "integration-scope";
3277
- if (code === "approval_required") return "integration-approval";
3278
- if (code === "unsafe_write_denied") return "integration-policy";
3279
- if (code === "manifest_invalid") return "integration-manifest";
3280
- return "integration-provider";
3281
- }
3282
- function severityForInvokeFailure(code) {
3283
- return code === "provider_failure" ? "warning" : "error";
3284
- }
3285
- function suggestionForInvokeFailure(input) {
3286
- if (input.code === "auth_expired") return `Reconnect ${input.connectorId} before retrying.`;
3287
- if (input.code === "scope_denied")
3288
- return `Request the missing scope for ${input.connectorId}.${input.actionId}.`;
3289
- if (input.code === "approval_required")
3290
- return `Ask the user to approve ${input.connectorId}.${input.actionId}.`;
3291
- if (input.code === "unsafe_write_denied")
3292
- return `Route ${input.connectorId}.${input.actionId} through the write-approval policy.`;
3293
- if (input.code === "manifest_invalid")
3294
- return `Fix the integration manifest for ${input.connectorId}.${input.actionId}.`;
3295
- return `Retry or degrade gracefully after ${input.connectorId} provider failure.`;
3296
- }
3297
- function messageForManifest(input) {
3298
- if (!input.valid) return `Integration manifest for ${input.connectorId} is invalid.`;
3299
- if ((input.missingConnections?.length ?? 0) > 0)
3300
- return `Missing connection for ${input.connectorId}.`;
3301
- if ((input.missingScopes?.length ?? 0) > 0)
3302
- return `Missing required scopes for ${input.connectorId}.`;
3303
- if (input.approvalRequired)
3304
- return `Approval required for ${input.connectorId}${input.actionId ? `.${input.actionId}` : ""}.`;
3305
- return `${input.connectorId} is ready.`;
3306
- }
3307
- function suggestionForManifest(input) {
3308
- if (!input.valid) return "Fix or regenerate the integration manifest before running the agent.";
3309
- if ((input.missingConnections?.length ?? 0) > 0)
3310
- return `Connect ${input.missingConnections.join(", ")} before replaying the workflow.`;
3311
- if ((input.missingScopes?.length ?? 0) > 0)
3312
- return `Request scopes: ${input.missingScopes.join(", ")}.`;
3313
- if (input.approvalRequired) return "Create an approval request and replay after approval.";
3314
- return "No action required.";
3460
+ function compact(value, max) {
3461
+ const normalized = value.replace(/\s+/g, " ").trim();
3462
+ return normalized.length > max ? `${normalized.slice(0, max).trim()}...` : normalized;
3463
+ }
3464
+ function stableHash(input) {
3465
+ let hash = 2166136261;
3466
+ for (let i = 0; i < input.length; i += 1) {
3467
+ hash ^= input.charCodeAt(i);
3468
+ hash = Math.imul(hash, 16777619);
3469
+ }
3470
+ return hash >>> 0;
3471
+ }
3472
+ function canonicalize2(value) {
3473
+ if (value === null || typeof value !== "object") return value;
3474
+ if (Array.isArray(value)) return value.map(canonicalize2);
3475
+ const out = {};
3476
+ for (const key of Object.keys(value).sort()) {
3477
+ out[key] = canonicalize2(value[key]);
3478
+ }
3479
+ return out;
3315
3480
  }
3316
3481
 
3317
3482
  // src/integrity/backend-integrity.ts
@@ -3796,9 +3961,9 @@ function scorePrReviewComments(auditCase, comments, source, weights = {}) {
3796
3961
  })
3797
3962
  };
3798
3963
  }
3799
- function summarizePrReviewBenchmark(scores) {
3964
+ function summarizePrReviewBenchmark(scores2) {
3800
3965
  const bySource = /* @__PURE__ */ new Map();
3801
- for (const score of scores) {
3966
+ for (const score of scores2) {
3802
3967
  bySource.set(score.source, [...bySource.get(score.source) ?? [], score]);
3803
3968
  }
3804
3969
  return [...bySource.entries()].map(([source, sourceScores]) => ({
@@ -3913,396 +4078,6 @@ function sum(values) {
3913
4078
  return values.reduce((total, value) => total + value, 0);
3914
4079
  }
3915
4080
 
3916
- // src/production-loop.ts
3917
- async function runProductionLoop(opts) {
3918
- validate2(opts);
3919
- const now = opts.now ?? (() => /* @__PURE__ */ new Date());
3920
- const startedAt = now().toISOString();
3921
- const observedRuns = await opts.traceStore.listRuns();
3922
- const observedFeedback = await opts.feedbackStore.list();
3923
- const clusterReport = await failureClusterView(opts.traceStore, {
3924
- minClusterSize: opts.cluster.minClusterSize ?? 1
3925
- });
3926
- const minSize = opts.cluster.minClusterSize ?? 5;
3927
- const minSeverity = opts.cluster.minSeverityRatio ?? 0.05;
3928
- const maxClusters = opts.cluster.maxClustersPerCycle ?? 1;
3929
- const totalRuns = clusterReport.totalRuns;
3930
- const actionable = clusterReport.clusters.filter((c) => c.runCount >= minSize).filter((c) => totalRuns === 0 || c.runCount / totalRuns >= minSeverity).slice(0, maxClusters);
3931
- if (actionable.length === 0) {
3932
- return finalize({
3933
- opts,
3934
- decision: "no_actionable_failures",
3935
- startedAt,
3936
- now,
3937
- observedRunCount: observedRuns.length,
3938
- observedFeedbackCount: observedFeedback.length,
3939
- clusters: clusterReport.clusters,
3940
- actedOnCluster: null,
3941
- evolution: null,
3942
- release: null,
3943
- gate: null,
3944
- promotedPrompt: opts.evolve.baselinePrompt,
3945
- pullRequest: null
3946
- });
3947
- }
3948
- const actedOn = actionable[0];
3949
- const baseline = {
3950
- id: opts.evolve.baselineId ?? "baseline",
3951
- label: opts.evolve.baselineId ?? "baseline",
3952
- generation: 0,
3953
- payload: opts.evolve.baselinePrompt
3954
- };
3955
- const holdoutIds = uniqueIds(opts.evolve.holdoutScenarios.map((s) => s.id));
3956
- const searchIds = uniqueIds(
3957
- (opts.evolve.searchScenarios ?? deriveSearchScenarios(opts.evolve.holdoutScenarios)).map(
3958
- (s) => s.id
3959
- )
3960
- );
3961
- if (searchIds.some((id) => holdoutIds.includes(id))) {
3962
- throw new ValidationError(
3963
- "runProductionLoop: searchScenarios and holdoutScenarios must be disjoint"
3964
- );
3965
- }
3966
- const reps = opts.evolve.reps ?? 3;
3967
- const generations = opts.evolve.generations ?? 3;
3968
- const populationSize = opts.evolve.populationSize ?? Math.max(2, opts.evolve.reps ?? 4);
3969
- const evolution = await runMultiShotOptimization({
3970
- runId: `${opts.runId}/evolve`,
3971
- target: opts.target,
3972
- seedVariants: [baseline],
3973
- searchScenarioIds: searchIds,
3974
- reps,
3975
- generations,
3976
- populationSize,
3977
- scoreConcurrency: opts.evolve.scoreConcurrency ?? 1,
3978
- runner: opts.evolve.runner,
3979
- scorer: opts.evolve.scorer,
3980
- mutateAdapter: opts.evolve.mutator,
3981
- gate: {
3982
- holdoutScenarioIds: holdoutIds,
3983
- reps,
3984
- gate: { ...opts.evolve.gate, baselineKey: baseline.id },
3985
- toRunRecord: opts.evolve.toRunRecord ?? (({ variant, scenarioId, rep, split, seed, trial }) => syntheticRunRecord({
3986
- runId: `${opts.runId}-${variant.id}-${scenarioId}-${rep}-${split}`,
3987
- variant,
3988
- scenarioId,
3989
- rep,
3990
- split,
3991
- seed,
3992
- trial,
3993
- target: opts.target
3994
- }))
3995
- }
3996
- });
3997
- const gate = evolution.gate?.decision ?? null;
3998
- const promotedVariant = evolution.promotedVariant;
3999
- const promoted = promotedVariant.payload;
4000
- const promotedChanged = promotedVariant.id !== baseline.id;
4001
- const allTrials = evolution.evolution.generations.flatMap(
4002
- (g) => g.trials
4003
- );
4004
- const traceEvidence = releaseTraceEvidenceFromMultiShotTrials(allTrials);
4005
- const releaseScenarios = [
4006
- ...(opts.evolve.searchScenarios ?? []).map((s) => ({
4007
- id: s.id,
4008
- payload: s,
4009
- split: "train",
4010
- tags: { persona: s.persona, label: s.label }
4011
- })),
4012
- ...opts.evolve.holdoutScenarios.map((s) => ({
4013
- id: s.id,
4014
- payload: s,
4015
- split: "holdout",
4016
- tags: { persona: s.persona, label: s.label }
4017
- }))
4018
- ];
4019
- const release = evaluateReleaseConfidence({
4020
- target: opts.target,
4021
- candidateId: promotedVariant.id,
4022
- baselineId: baseline.id,
4023
- scenarios: releaseScenarios,
4024
- traces: traceEvidence,
4025
- gateDecision: gate ?? void 0,
4026
- thresholds: opts.releaseThresholds,
4027
- runs: [...evolution.gate?.candidateRuns ?? [], ...evolution.gate?.baselineRuns ?? []]
4028
- });
4029
- if (!promotedChanged) {
4030
- return finalize({
4031
- opts,
4032
- decision: "evolve_yielded_no_improvement",
4033
- startedAt,
4034
- now,
4035
- observedRunCount: observedRuns.length,
4036
- observedFeedbackCount: observedFeedback.length,
4037
- clusters: clusterReport.clusters,
4038
- actedOnCluster: actedOn,
4039
- evolution,
4040
- release,
4041
- gate,
4042
- promotedPrompt: promoted,
4043
- pullRequest: null
4044
- });
4045
- }
4046
- if (release.status === "fail" || gate && !gate.promote) {
4047
- return finalize({
4048
- opts,
4049
- decision: "gate_failed",
4050
- startedAt,
4051
- now,
4052
- observedRunCount: observedRuns.length,
4053
- observedFeedbackCount: observedFeedback.length,
4054
- clusters: clusterReport.clusters,
4055
- actedOnCluster: actedOn,
4056
- evolution,
4057
- release,
4058
- gate,
4059
- promotedPrompt: promoted,
4060
- pullRequest: null
4061
- });
4062
- }
4063
- if (!opts.ship) {
4064
- return finalize({
4065
- opts,
4066
- decision: "proposed_change",
4067
- startedAt,
4068
- now,
4069
- observedRunCount: observedRuns.length,
4070
- observedFeedbackCount: observedFeedback.length,
4071
- clusters: clusterReport.clusters,
4072
- actedOnCluster: actedOn,
4073
- evolution,
4074
- release,
4075
- gate,
4076
- promotedPrompt: promoted,
4077
- pullRequest: null
4078
- });
4079
- }
4080
- const baselineStr = toPromptString(baseline.payload);
4081
- const promotedStr = toPromptString(promoted);
4082
- const ctx = {
4083
- runId: opts.runId,
4084
- target: opts.target,
4085
- decision: "pr_opened",
4086
- clusters: clusterReport.clusters,
4087
- actedOnCluster: actedOn,
4088
- observedRunCount: observedRuns.length,
4089
- observedFeedbackCount: observedFeedback.length,
4090
- evolution,
4091
- release,
4092
- gate,
4093
- baselinePromptString: baselineStr,
4094
- promotedPromptString: promotedStr
4095
- };
4096
- const renderBody = opts.ship.renderBody ?? defaultRenderBody;
4097
- const renderFile = opts.ship.renderPromptFile ?? ((next, _prev) => `${next}
4098
- `);
4099
- const currentFile = opts.ship.readCurrentPromptFile ? await opts.ship.readCurrentPromptFile() : null;
4100
- const pr = await proposeAutomatedPullRequest(opts.ship.client, {
4101
- repo: opts.ship.repo,
4102
- baseBranch: opts.ship.baseBranch ?? "main",
4103
- branchName: `${opts.ship.branchPrefix.replace(/\/+$/, "")}/${opts.runId}`,
4104
- title: `${opts.target}: production-loop prompt update (${opts.runId})`,
4105
- body: renderBody(ctx),
4106
- reviewers: opts.ship.reviewers,
4107
- labels: opts.ship.labels,
4108
- fileChanges: [
4109
- {
4110
- path: opts.ship.promptFilePath,
4111
- contents: renderFile(promotedStr, currentFile),
4112
- rationale: `Auto-improved against cluster "${actedOn.failureClass}" (${actedOn.runCount} prod failures)`
4113
- }
4114
- ],
4115
- dryRun: opts.ship.dryRun
4116
- });
4117
- return finalize({
4118
- opts,
4119
- decision: "pr_opened",
4120
- startedAt,
4121
- now,
4122
- observedRunCount: observedRuns.length,
4123
- observedFeedbackCount: observedFeedback.length,
4124
- clusters: clusterReport.clusters,
4125
- actedOnCluster: actedOn,
4126
- evolution,
4127
- release,
4128
- gate,
4129
- promotedPrompt: promoted,
4130
- pullRequest: pr
4131
- });
4132
- }
4133
- function finalize(args) {
4134
- return {
4135
- runId: args.opts.runId,
4136
- target: args.opts.target,
4137
- decision: args.decision,
4138
- startedAt: args.startedAt,
4139
- finishedAt: args.now().toISOString(),
4140
- observedRunCount: args.observedRunCount,
4141
- observedFeedbackCount: args.observedFeedbackCount,
4142
- clusters: args.clusters,
4143
- actedOnCluster: args.actedOnCluster,
4144
- evolution: args.evolution,
4145
- release: args.release,
4146
- gate: args.gate,
4147
- baselinePrompt: args.opts.evolve.baselinePrompt,
4148
- promotedPrompt: args.promotedPrompt,
4149
- pullRequest: args.pullRequest,
4150
- cron: args.opts.cron ?? null
4151
- };
4152
- }
4153
- function validate2(opts) {
4154
- if (!opts.runId.trim()) throw new ValidationError("runProductionLoop: runId required");
4155
- if (!opts.target.trim()) throw new ValidationError("runProductionLoop: target required");
4156
- if (opts.evolve.holdoutScenarios.length === 0) {
4157
- throw new ValidationError("runProductionLoop: evolve.holdoutScenarios must not be empty");
4158
- }
4159
- if (opts.evolve.searchScenarios && opts.evolve.searchScenarios.length === 0) {
4160
- throw new ValidationError(
4161
- "runProductionLoop: evolve.searchScenarios must be omitted or non-empty"
4162
- );
4163
- }
4164
- if (!opts.evolve.gate.baselineKey && !opts.evolve.baselineId) {
4165
- }
4166
- if (opts.ship) {
4167
- if (!opts.ship.branchPrefix.trim()) {
4168
- throw new ValidationError("runProductionLoop: ship.branchPrefix required");
4169
- }
4170
- if (!opts.ship.promptFilePath.trim()) {
4171
- throw new ValidationError("runProductionLoop: ship.promptFilePath required");
4172
- }
4173
- }
4174
- }
4175
- function uniqueIds(ids) {
4176
- const seen = /* @__PURE__ */ new Set();
4177
- const out = [];
4178
- for (const id of ids) {
4179
- if (seen.has(id)) continue;
4180
- seen.add(id);
4181
- out.push(id);
4182
- }
4183
- return out;
4184
- }
4185
- function deriveSearchScenarios(holdout) {
4186
- if (holdout.length < 4) {
4187
- return [
4188
- {
4189
- ...holdout[0],
4190
- id: `${holdout[0].id}__search`
4191
- }
4192
- ];
4193
- }
4194
- return holdout.filter((_, i) => i % 4 === 0).map((s) => ({ ...s, id: `${s.id}__search` }));
4195
- }
4196
- function syntheticRunRecord(input) {
4197
- const scoreKey = input.split === "holdout" ? "holdoutScore" : "searchScore";
4198
- return {
4199
- runId: input.runId,
4200
- experimentId: input.target,
4201
- candidateId: input.variant.id,
4202
- seed: input.seed,
4203
- model: "production-loop@synthetic",
4204
- promptHash: "0".repeat(64),
4205
- configHash: "0".repeat(64),
4206
- commitSha: "0".repeat(40),
4207
- wallMs: input.trial.durationMs ?? 1,
4208
- costUsd: input.trial.cost ?? 0,
4209
- tokenUsage: { input: 0, output: 0 },
4210
- outcome: {
4211
- [scoreKey]: input.trial.score,
4212
- raw: { score: input.trial.score, ok: input.trial.ok ? 1 : 0 }
4213
- },
4214
- splitTag: input.split,
4215
- scenarioId: input.scenarioId
4216
- };
4217
- }
4218
- function toPromptString(payload) {
4219
- if (typeof payload === "string") return payload;
4220
- if (payload == null) return "";
4221
- try {
4222
- return JSON.stringify(payload, null, 2);
4223
- } catch {
4224
- return String(payload);
4225
- }
4226
- }
4227
- function defaultRenderBody(ctx) {
4228
- const cluster = ctx.actedOnCluster;
4229
- const release = ctx.release;
4230
- const gate = ctx.gate;
4231
- const lines = [];
4232
- lines.push(`## Production-loop prompt update \u2014 \`${ctx.target}\``);
4233
- lines.push("");
4234
- lines.push(`Run id: \`${ctx.runId}\``);
4235
- lines.push(`Decision: \`${ctx.decision}\``);
4236
- lines.push(
4237
- `Observed in this cycle: ${ctx.observedRunCount} prod runs, ${ctx.observedFeedbackCount} feedback trajectories.`
4238
- );
4239
- lines.push("");
4240
- if (cluster) {
4241
- lines.push("### Triggering failure cluster");
4242
- lines.push("");
4243
- lines.push(`- **class**: \`${cluster.failureClass}\``);
4244
- lines.push(`- **runs in cluster**: ${cluster.runCount}`);
4245
- lines.push(`- **distinct scenarios**: ${cluster.scenarioIds.length}`);
4246
- if (cluster.toolName) lines.push(`- **tool**: \`${cluster.toolName}\``);
4247
- if (cluster.dimension) lines.push(`- **judge dimension**: \`${cluster.dimension}\``);
4248
- if (cluster.exampleError) {
4249
- lines.push(
4250
- `- **example error**: \`${cluster.exampleError.slice(0, 200).replace(/\n/g, " ")}\``
4251
- );
4252
- }
4253
- lines.push("");
4254
- }
4255
- if (gate) {
4256
- lines.push("### Held-out promotion gate");
4257
- lines.push("");
4258
- lines.push(`- **decision**: \`${gate.promote ? "PROMOTE" : "REJECT"}\``);
4259
- lines.push(`- **paired median delta**: ${gate.evidence.medianPairedDelta.toFixed(4)}`);
4260
- lines.push(
4261
- `- **paired 95% CI**: [${gate.evidence.pairedCI.low.toFixed(4)}, ${gate.evidence.pairedCI.high.toFixed(4)}]`
4262
- );
4263
- lines.push(`- **paired p-value**: ${gate.evidence.pairedPValue.toFixed(4)}`);
4264
- lines.push(
4265
- `- **search/holdout means**: ${gate.evidence.searchScore.toFixed(4)} / ${gate.evidence.holdoutScore.toFixed(4)}`
4266
- );
4267
- lines.push(`- **overfit gap**: ${gate.evidence.overfitGap.toFixed(4)}`);
4268
- lines.push("");
4269
- }
4270
- if (release) {
4271
- lines.push("### Release confidence");
4272
- lines.push("");
4273
- lines.push(`- **status**: \`${release.status}\``);
4274
- lines.push(`- **pass rate**: ${release.metrics.passRate.toFixed(4)}`);
4275
- lines.push(`- **mean score**: ${release.metrics.meanScore.toFixed(4)}`);
4276
- if (release.issues.length > 0) {
4277
- lines.push("- **issues**:");
4278
- for (const issue of release.issues) {
4279
- lines.push(` - \`${issue.severity}\` ${issue.axis}: ${issue.detail}`);
4280
- }
4281
- }
4282
- lines.push("");
4283
- }
4284
- lines.push("### Prompt diff");
4285
- lines.push("");
4286
- lines.push("```diff");
4287
- lines.push(unifiedDiff(ctx.baselinePromptString, ctx.promotedPromptString));
4288
- lines.push("```");
4289
- return lines.join("\n");
4290
- }
4291
- function unifiedDiff(a, b) {
4292
- const aLines = a.split("\n");
4293
- const bLines = b.split("\n");
4294
- const out = [];
4295
- const max = Math.max(aLines.length, bLines.length);
4296
- for (let i = 0; i < max; i++) {
4297
- const al = aLines[i];
4298
- const bl = bLines[i];
4299
- if (al === bl) continue;
4300
- if (al !== void 0) out.push(`- ${al}`);
4301
- if (bl !== void 0) out.push(`+ ${bl}`);
4302
- }
4303
- return out.join("\n");
4304
- }
4305
-
4306
4081
  // src/registry.ts
4307
4082
  var ScenarioRegistry = class {
4308
4083
  scenarios = [];
@@ -5395,6 +5170,89 @@ var FileSystemExperimentStore = class {
5395
5170
  }
5396
5171
  };
5397
5172
 
5173
+ // src/pareto.ts
5174
/**
 * Pareto dominance test: `a` dominates `b` when it is no worse on every
 * objective and strictly better on at least one.
 *
 * Each objective supplies `value(candidate)` and a `direction`; "maximize"
 * prefers larger values, anything else prefers smaller ones. If either
 * candidate yields a non-finite value for an objective the answer is false.
 *
 * @param {*} a - Candidate tested as the dominator.
 * @param {*} b - Candidate tested as the dominated one.
 * @param {Array<{direction: string, value: Function}>} objectives
 * @returns {boolean} True when `a` dominates `b`.
 */
function dominates(a, b, objectives) {
  let hasStrictGain = false;
  for (const objective of objectives) {
    const left = objective.value(a);
    const right = objective.value(b);
    if (!(Number.isFinite(left) && Number.isFinite(right))) return false;
    if (left === right) continue;
    const leftWins = objective.direction === "maximize" ? left > right : left < right;
    if (!leftWins) return false;
    hasStrictGain = true;
  }
  return hasStrictGain;
}
5187
/**
 * Split candidates into the Pareto frontier and the dominated set.
 *
 * Candidates with a non-finite value on any objective are dropped before
 * comparison and appear in neither list. A candidate is on the frontier when
 * no other remaining candidate dominates it.
 *
 * @param {Array<*>} candidates - Items to compare.
 * @param {Array<{direction: string, value: Function}>} objectives - At least one objective.
 * @returns {{frontier: Array<*>, dominated: Array<*>, dominanceMap: Array<{dominator: *, dominated: Array<*>}>}}
 *   `dominanceMap` lists, for each frontier member, the dominated candidates it dominates.
 * @throws {Error} When `objectives` is empty.
 */
function paretoFrontier(candidates, objectives) {
  if (objectives.length === 0) {
    throw new Error("paretoFrontier: at least 1 objective required");
  }
  const hasFiniteValues = (candidate) => objectives.every((objective) => Number.isFinite(objective.value(candidate)));
  const eligible = candidates.filter((candidate) => hasFiniteValues(candidate));
  const frontier = [];
  const dominated = [];
  for (const candidate of eligible) {
    const beaten = eligible.some((rival) => rival !== candidate && dominates(rival, candidate, objectives));
    (beaten ? dominated : frontier).push(candidate);
  }
  const dominanceMap = frontier.map((dominator) => ({
    dominator,
    dominated: dominated.filter((loser) => dominates(dominator, loser, objectives))
  }));
  return { frontier, dominated, dominanceMap };
}
5205
/**
 * Collapse several objectives into one weighted score per candidate.
 *
 * Each objective is min-max normalised over the finite values seen across
 * `candidates`, flipped for non-"maximize" objectives so that larger is
 * always better, and combined using weights divided by their total. A
 * candidate's non-finite objective values contribute nothing to its score.
 *
 * @param {Array<*>} candidates - Items to score.
 * @param {Array<{name: string, direction: string, value: Function}>} objectives
 * @param {{weights?: Object<string, number>}} [options] - Per-objective weights
 *   keyed by objective name; a missing weight defaults to 1.
 * @returns {Array<{candidate: *, score: number}>} One entry per candidate, in input order.
 */
function scalarScore(candidates, objectives, options = {}) {
  if (candidates.length === 0) return [];
  const weights = options.weights ?? {};
  const weightOf = (obj) => weights[obj.name] ?? 1;
  const totalWeight = objectives.reduce((s, o) => s + weightOf(o), 0);
  // A zero total (e.g. every weight set to 0) would turn every score into
  // NaN/Infinity; fall back to un-normalised weights in that case.
  const divisor = totalWeight !== 0 ? totalWeight : 1;
  const ranges = objectives.map((obj) => {
    // Plain loop instead of Math.min(...values): spreading a very large array
    // into a call can exceed the engine's argument limit.
    let min = Infinity;
    let max = -Infinity;
    for (const c of candidates) {
      const v = obj.value(c);
      if (!Number.isFinite(v)) continue;
      if (v < min) min = v;
      if (v > max) max = v;
    }
    // No finite value at all for this objective.
    if (min === Infinity) return { min: 0, max: 1 };
    // Widen a degenerate range so the normalisation never divides by zero.
    return { min, max: max === min ? min + 1 : max };
  });
  return candidates.map((c) => {
    let score = 0;
    objectives.forEach((obj, i) => {
      const v = obj.value(c);
      if (!Number.isFinite(v)) return;
      const { min, max } = ranges[i];
      const normalised = (v - min) / (max - min);
      const directional = obj.direction === "maximize" ? normalised : 1 - normalised;
      score += directional * (weightOf(obj) / divisor);
    });
    return { candidate: c, score };
  });
}
5230
/**
 * Crowding distance of each candidate across the given objectives.
 *
 * For every objective the candidates are sorted by value. The two extremes
 * get an infinite distance; each interior candidate accumulates the gap
 * between its two neighbours divided by the objective's value range (or by 1
 * when the range is zero). A candidate that is an extreme for any objective
 * stays at Infinity.
 *
 * @param {Array<*>} candidates - Items to measure.
 * @param {Array<{value: Function}>} objectives
 * @returns {Array<{candidate: *, distance: number}>} One entry per candidate,
 *   in input order; an empty list for empty input.
 */
function crowdingDistance(candidates, objectives) {
  // Without this guard the code below would call obj.value(undefined).
  if (candidates.length === 0) return [];
  const distances = new Map(candidates.map((c) => [c, 0]));
  for (const obj of objectives) {
    const sorted = [...candidates].sort((a, b) => obj.value(a) - obj.value(b));
    const first = sorted[0];
    const last = sorted[sorted.length - 1];
    const range = obj.value(last) - obj.value(first) || 1;
    distances.set(first, Infinity);
    distances.set(last, Infinity);
    for (let i = 1; i < sorted.length - 1; i++) {
      const current = distances.get(sorted[i]);
      // Already an extreme for some objective: stays infinite.
      if (current === Infinity) continue;
      const prev = obj.value(sorted[i - 1]);
      const next = obj.value(sorted[i + 1]);
      distances.set(sorted[i], current + (next - prev) / range);
    }
  }
  return candidates.map((c) => ({ candidate: c, distance: distances.get(c) ?? 0 }));
}
5249
/**
 * Pareto frontier ordered by crowding distance, largest distance first.
 *
 * @param {Array<*>} candidates - Items to compare.
 * @param {Array<{direction: string, value: Function}>} objectives
 * @returns {Array<{candidate: *, distance: number}>} Frontier members with
 *   their crowding distance; empty when the frontier is empty.
 */
function paretoFrontierWithCrowding(candidates, objectives) {
  const nonDominated = paretoFrontier(candidates, objectives).frontier;
  if (nonDominated.length === 0) return [];
  const ranked = crowdingDistance(nonDominated, objectives);
  ranked.sort((left, right) => right.distance - left.distance);
  return ranked;
}
5255
+
5398
5256
  // src/harness-optimizer.ts
5399
5257
  var DEFAULT_HARNESS_OBJECTIVES = [
5400
5258
  { name: "aggregate", direction: "maximize", value: (r) => r.aggregateMean },
@@ -5485,20 +5343,20 @@ async function mapLimit(items, limit, fn) {
5485
5343
  function mean2(values) {
5486
5344
  return values.length ? values.reduce((sum3, value) => sum3 + value, 0) / values.length : 0;
5487
5345
  }
5488
/**
 * Average a list of per-run score records field by field.
 *
 * Every numeric field is the mean (via `mean2`) of that field across the
 * records; `notes` is the concatenation of each record's notes, with a
 * missing `notes` treated as empty.
 *
 * @param {Array<Object>} scores2 - Per-run score records.
 * @returns {Object} A single record holding the field-wise means and merged notes.
 */
function meanRunScore(scores2) {
  const averageOf = (pick) => mean2(scores2.map((entry) => pick(entry)));
  return {
    success: averageOf((entry) => entry.success),
    goalProgress: averageOf((entry) => entry.goalProgress),
    repoGroundedness: averageOf((entry) => entry.repoGroundedness),
    driftPenalty: averageOf((entry) => entry.driftPenalty),
    toolUseQuality: averageOf((entry) => entry.toolUseQuality),
    patchQuality: averageOf((entry) => entry.patchQuality),
    testReality: averageOf((entry) => entry.testReality),
    finalGate: averageOf((entry) => entry.finalGate),
    reviewerBlockers: averageOf((entry) => entry.reviewerBlockers),
    costUsd: averageOf((entry) => entry.costUsd),
    wallSeconds: averageOf((entry) => entry.wallSeconds),
    notes: scores2.flatMap((entry) => entry.notes ?? [])
  };
}
5504
5362
 
@@ -6473,12 +6331,12 @@ function recordRuns(runs, opts) {
6473
6331
  for (const [scenarioId, scenarioRuns] of byScenario) {
6474
6332
  const scored = scenarioRuns.map((run) => ({ run, score: runScore(run) })).filter((s) => s.score !== void 0);
6475
6333
  if (scored.length === 0) continue;
6476
- const scores = scored.map((s) => s.score);
6334
+ const scores2 = scored.map((s) => s.score);
6477
6335
  const entry = {
6478
6336
  commitSha: opts.commitSha,
6479
6337
  timestamp,
6480
- scores,
6481
- composite: median(scores),
6338
+ scores: scores2,
6339
+ composite: median(scores2),
6482
6340
  runIds: scored.map((s) => s.run.runId)
6483
6341
  };
6484
6342
  const perDimension = aggregatePerDimension(scenarioRuns);
@@ -6600,17 +6458,17 @@ function formatScorecardDiff(diff) {
6600
6458
  lines.push(
6601
6459
  `Scorecard: ${summary.regressed} regressed \xB7 ${summary.improved} improved \xB7 ${summary.flat} flat \xB7 ${summary.new} new`
6602
6460
  );
6603
- const fmt = (n) => n.toFixed(3);
6461
+ const fmt2 = (n) => n.toFixed(3);
6604
6462
  const noteworthy = diff.cells.filter((c) => c.verdict === "regressed" || c.verdict === "improved").sort((a, b) => {
6605
6463
  if (a.verdict !== b.verdict) return a.verdict === "regressed" ? -1 : 1;
6606
6464
  return Math.abs(b.delta ?? 0) - Math.abs(a.delta ?? 0);
6607
6465
  });
6608
6466
  for (const cell of noteworthy) {
6609
6467
  const mark = cell.verdict === "regressed" ? "REGRESSED" : "improved";
6610
- const deltaStr = cell.delta !== null ? cell.delta >= 0 ? `+${fmt(cell.delta)}` : fmt(cell.delta) : "\u2014";
6468
+ const deltaStr = cell.delta !== null ? cell.delta >= 0 ? `+${fmt2(cell.delta)}` : fmt2(cell.delta) : "\u2014";
6611
6469
  const stat = cell.cohensD !== null ? ` (d=${cell.cohensD.toFixed(2)}${cell.pValue !== null ? `, p=${cell.pValue.toFixed(3)}` : ""})` : "";
6612
6470
  lines.push(
6613
- ` ${mark} ${cell.scenarioId} \xB7 ${cell.model} \xB7 ${cell.profileHash.slice(0, 8)} ${fmt(cell.baseline ?? 0)} \u2192 ${fmt(cell.current)} ${deltaStr}${stat}`
6471
+ ` ${mark} ${cell.scenarioId} \xB7 ${cell.model} \xB7 ${cell.profileHash.slice(0, 8)} ${fmt2(cell.baseline ?? 0)} \u2192 ${fmt2(cell.current)} ${deltaStr}${stat}`
6614
6472
  );
6615
6473
  }
6616
6474
  return lines.join("\n");
@@ -6625,10 +6483,10 @@ function analyzeSeries(values, options = {}) {
6625
6483
  return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
6626
6484
  }
6627
6485
  const tail = values.slice(-window);
6628
- const mean4 = tail.reduce((a, b) => a + b, 0) / tail.length;
6629
- const variance = tail.reduce((acc, v) => acc + (v - mean4) ** 2, 0) / tail.length;
6486
+ const mean5 = tail.reduce((a, b) => a + b, 0) / tail.length;
6487
+ const variance = tail.reduce((acc, v) => acc + (v - mean5) ** 2, 0) / tail.length;
6630
6488
  const stdDev = Math.sqrt(variance);
6631
- const refMean = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
6489
+ const refMean = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
6632
6490
  const cv = stdDev / refMean;
6633
6491
  const stable = tail.length >= window && cv <= stableCv;
6634
6492
  let tailRun = 0;
@@ -6649,7 +6507,7 @@ function analyzeSeries(values, options = {}) {
6649
6507
  } else {
6650
6508
  state = "noisy";
6651
6509
  }
6652
- return { state, windowMean: mean4, windowCv: cv, tailRun, stable };
6510
+ return { state, windowMean: mean5, windowCv: cv, tailRun, stable };
6653
6511
  }
6654
6512
 
6655
6513
  // src/slo.ts
@@ -7027,12 +6885,12 @@ function renderMarkdownReport(reports) {
7027
6885
  async function aggregateRunMetrics(runs, store) {
7028
6886
  if (runs.length === 0) return {};
7029
6887
  const durations = [];
7030
- const scores = [];
6888
+ const scores2 = [];
7031
6889
  const passes = [];
7032
6890
  const costs = [];
7033
6891
  for (const r of runs) {
7034
6892
  if (r.endedAt) durations.push(r.endedAt - r.startedAt);
7035
- if (r.outcome?.score !== void 0) scores.push(r.outcome.score);
6893
+ if (r.outcome?.score !== void 0) scores2.push(r.outcome.score);
7036
6894
  passes.push(r.outcome?.pass === true ? 1 : 0);
7037
6895
  const llm = await llmSpans(store, r.runId);
7038
6896
  costs.push(aggregateLlm(llm).costUsd);
@@ -7041,7 +6899,7 @@ async function aggregateRunMetrics(runs, store) {
7041
6899
  provisionMs: average(durations),
7042
6900
  firstTokenMs: average(durations),
7043
6901
  wallMs: average(durations),
7044
- overallScore: average(scores),
6902
+ overallScore: average(scores2),
7045
6903
  passRate: average(passes),
7046
6904
  costUsd: average(costs)
7047
6905
  };
@@ -7205,7 +7063,7 @@ async function toLangfuseEnvelope(store, runId) {
7205
7063
  },
7206
7064
  metadata: { finishReason: s.finishReason, cachedTokens: s.cachedTokens }
7207
7065
  }));
7208
- const scores = judges.map((j) => ({
7066
+ const scores2 = judges.map((j) => ({
7209
7067
  id: j.spanId,
7210
7068
  traceId: run.runId,
7211
7069
  observationId: j.targetSpanId,
@@ -7213,7 +7071,7 @@ async function toLangfuseEnvelope(store, runId) {
7213
7071
  value: j.score,
7214
7072
  comment: j.rationale
7215
7073
  }));
7216
- return { traceId: run.runId, generations, scores };
7074
+ return { traceId: run.runId, generations, scores: scores2 };
7217
7075
  }
7218
7076
  async function toPrometheusText(store) {
7219
7077
  const runs = await store.listRuns();
@@ -7314,12 +7172,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
7314
7172
  variantScores.push({ mutator: id, score, mutated });
7315
7173
  all.push(score);
7316
7174
  }
7317
- const mean4 = all.reduce((a, b) => a + b, 0) / all.length;
7318
- const variance = all.reduce((a, v) => a + (v - mean4) ** 2, 0) / all.length;
7175
+ const mean5 = all.reduce((a, b) => a + b, 0) / all.length;
7176
+ const variance = all.reduce((a, v) => a + (v - mean5) ** 2, 0) / all.length;
7319
7177
  const stdDev = Math.sqrt(variance);
7320
- const ref = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
7178
+ const ref = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
7321
7179
  const robustness = Math.max(0, 1 - stdDev / ref);
7322
- return { originalScore, variantScores, meanScore: mean4, stdDev, robustness };
7180
+ return { originalScore, variantScores, meanScore: mean5, stdDev, robustness };
7323
7181
  }
7324
7182
  var lowercaseMutator = (p) => p.toLowerCase();
7325
7183
  var sentenceReorderMutator = (p, seed) => {
@@ -7376,18 +7234,18 @@ async function paraphraseRobustnessScenarios(args) {
7376
7234
  const deltas = {};
7377
7235
  const paraphrasedAll = [];
7378
7236
  for (const m of args.mutators) {
7379
- const scores = [];
7237
+ const scores2 = [];
7380
7238
  for (let r = 0; r < reps; r++) {
7381
7239
  const mutatedTurns = scenario.userTurns.map((t) => m.mutator(t));
7382
7240
  const out = await args.runScenario({
7383
7241
  id: scenario.id,
7384
7242
  userTurns: mutatedTurns
7385
7243
  });
7386
- scores.push(out.score);
7244
+ scores2.push(out.score);
7387
7245
  }
7388
- const mean4 = scores.reduce((a, b) => a + b, 0) / scores.length;
7389
- deltas[m.name] = mean4 - originalScore;
7390
- paraphrasedAll.push(...scores);
7246
+ const mean5 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
7247
+ deltas[m.name] = mean5 - originalScore;
7248
+ paraphrasedAll.push(...scores2);
7391
7249
  }
7392
7250
  const paraphrasedMean = paraphrasedAll.length === 0 ? originalScore : paraphrasedAll.reduce((a, b) => a + b, 0) / paraphrasedAll.length;
7393
7251
  perScenario.push({ id: scenario.id, originalScore, paraphrasedMean, deltas });
@@ -7802,10 +7660,10 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
7802
7660
  }
7803
7661
  for (const s of scenarios) {
7804
7662
  const sRuns = runs.filter((r) => r.scenarioId === s.id);
7805
- const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
7806
- if (scores.length < 3) continue;
7807
- const mean4 = scores.reduce((a, b) => a + b, 0) / scores.length;
7808
- const variance = scores.reduce((a, b) => a + (b - mean4) ** 2, 0) / scores.length;
7663
+ const scores2 = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
7664
+ if (scores2.length < 3) continue;
7665
+ const mean5 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
7666
+ const variance = scores2.reduce((a, b) => a + (b - mean5) ** 2, 0) / scores2.length;
7809
7667
  if (variance > varianceThreshold) {
7810
7668
  targets.push({
7811
7669
  reason: "high-variance",
@@ -7979,15 +7837,15 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
7979
7837
  const rejected = [];
7980
7838
  const surviving = [];
7981
7839
  for (const candidate of proposed) {
7982
- const scores = await scorer.scoreCandidate(candidate, targets);
7983
- if (scores.length < 2) {
7840
+ const scores2 = await scorer.scoreCandidate(candidate, targets);
7841
+ if (scores2.length < 2) {
7984
7842
  rejected.push({ candidate, reason: "scorer returned <2 results" });
7985
7843
  continue;
7986
7844
  }
7987
- const values = scores.map((s) => s.score);
7845
+ const values = scores2.map((s) => s.score);
7988
7846
  const spread = Math.max(...values) - Math.min(...values);
7989
7847
  const maxScore = Math.max(...values);
7990
- scored.push({ candidate, scores, spread });
7848
+ scored.push({ candidate, scores: scores2, spread });
7991
7849
  if (maxScore < floor) {
7992
7850
  rejected.push({
7993
7851
  candidate,
@@ -9138,15 +8996,15 @@ function scoreReferenceReplay(scenarios, options = {}) {
9138
8996
  const threshold = options.matchThreshold ?? DEFAULT_MATCH_THRESHOLD;
9139
8997
  const matchStrategy = options.matchStrategy ?? "reference-order";
9140
8998
  const allowedSplits = new Set(options.splits ?? ALL_SPLITS);
9141
- const scores = scenarios.filter((scenario) => {
8999
+ const scores2 = scenarios.filter((scenario) => {
9142
9000
  const split = scenario.split ?? "train";
9143
9001
  if (split === "holdout" && !options.includeHoldout) return false;
9144
9002
  return allowedSplits.has(split);
9145
9003
  }).map((scenario) => scoreScenario(scenario, matcher, threshold, matchStrategy));
9146
9004
  return {
9147
- scenarios: scores,
9148
- aggregate: aggregateScenarioScores(scores),
9149
- bySplit: aggregateBySplit(scores)
9005
+ scenarios: scores2,
9006
+ aggregate: aggregateScenarioScores(scores2),
9007
+ bySplit: aggregateBySplit(scores2)
9150
9008
  };
9151
9009
  }
9152
9010
  function compareReferenceReplay(baseline, candidate) {
@@ -9369,20 +9227,20 @@ function buildScenarioScore(scenario, matches, falsePositives) {
9369
9227
  matches
9370
9228
  };
9371
9229
  }
9372
/**
 * Aggregate scenario scores separately for each split in `ALL_SPLITS`.
 *
 * Splits with no matching scores are left out of the result.
 *
 * @param {Array<{split: string}>} scores2 - Per-scenario scores.
 * @returns {Object} Map from split name to the result of `aggregateScenarioScores`.
 */
function aggregateBySplit(scores2) {
  const bySplit = {};
  for (const splitName of ALL_SPLITS) {
    const members = scores2.filter((entry) => entry.split === splitName);
    if (members.length === 0) continue;
    bySplit[splitName] = aggregateScenarioScores(members);
  }
  return bySplit;
}
9380
- function aggregateScenarioScores(scores) {
9381
- const matched = sum2(scores.map((score) => score.matched));
9382
- const total = sum2(scores.map((score) => score.total));
9383
- const falsePositives = sum2(scores.map((score) => score.falsePositives));
9384
- const matchedWeight = sum2(scores.map((score) => score.matchedWeight));
9385
- const totalWeight = sum2(scores.map((score) => score.totalWeight));
9238
+ function aggregateScenarioScores(scores2) {
9239
+ const matched = sum2(scores2.map((score) => score.matched));
9240
+ const total = sum2(scores2.map((score) => score.total));
9241
+ const falsePositives = sum2(scores2.map((score) => score.falsePositives));
9242
+ const matchedWeight = sum2(scores2.map((score) => score.matchedWeight));
9243
+ const totalWeight = sum2(scores2.map((score) => score.totalWeight));
9386
9244
  const precision2 = ratio(matched, matched + falsePositives);
9387
9245
  const recall = ratio(matched, total);
9388
9246
  return {
@@ -9625,154 +9483,6 @@ function createDefaultReviewer(options) {
9625
9483
  };
9626
9484
  }
9627
9485
 
9628
- // src/code-mutator.ts
9629
- function createSandboxCodeMutator(opts) {
9630
- const childIdFor = opts.childIdFor ?? ((parent, generation, index) => `${parent.id}.g${generation}.code.${index}`);
9631
- const labelFor = opts.labelFor ?? ((outcome, parent, _generation, index) => outcome.description?.slice(0, 80) ?? `${parent.label} \u2192 code.${index}`);
9632
- return {
9633
- async mutate(args) {
9634
- const { parent, parentAggregate, topTrials, bottomTrials, childCount, generation } = args;
9635
- const startedAt = Date.now();
9636
- const outcomes = await opts.pool.withSlot(async (slot) => {
9637
- try {
9638
- return await opts.runner({
9639
- slot,
9640
- parent,
9641
- parentAggregate,
9642
- topTrials,
9643
- bottomTrials,
9644
- childCount,
9645
- generation
9646
- });
9647
- } catch (err) {
9648
- return [
9649
- {
9650
- ok: false,
9651
- failureReason: "runner_error",
9652
- description: err instanceof Error ? err.message : String(err),
9653
- latencyMs: Date.now() - startedAt
9654
- }
9655
- ];
9656
- }
9657
- });
9658
- const variants = [];
9659
- let index = 0;
9660
- for (const outcome of outcomes) {
9661
- const childId = outcome.childId ?? childIdFor(parent, generation, index);
9662
- if (opts.mutationTelemetry) {
9663
- await opts.mutationTelemetry.record({
9664
- ts: Date.now(),
9665
- channel: "code",
9666
- generation,
9667
- parentId: parent.id,
9668
- childId: outcome.ok ? childId : null,
9669
- ok: outcome.ok,
9670
- failureReason: outcome.failureReason,
9671
- description: outcome.description,
9672
- latencyMs: outcome.latencyMs,
9673
- diffBytes: outcome.diffBytes,
9674
- filesTouched: outcome.filesTouched,
9675
- agentSteps: outcome.agentSteps,
9676
- costUsd: outcome.costUsd
9677
- });
9678
- }
9679
- if (opts.costLedger && outcome.costUsd !== void 0) {
9680
- await opts.costLedger.addMutation("code", outcome.costUsd, { generation });
9681
- }
9682
- if (outcome.ok) {
9683
- const variant = {
9684
- id: childId,
9685
- payload: opts.toVariantPayload(outcome, parent),
9686
- generation,
9687
- parentId: parent.id,
9688
- label: labelFor(outcome, parent, generation, index),
9689
- ...outcome.rationale ? { rationale: outcome.rationale } : {}
9690
- };
9691
- variants.push(variant);
9692
- if (opts.lineage) {
9693
- await opts.lineage.upsert({
9694
- id: variant.id,
9695
- parentId: variant.parentId ?? null,
9696
- generation: variant.generation,
9697
- kind: "code",
9698
- ...variant.rationale ? { rationale: variant.rationale } : {}
9699
- });
9700
- }
9701
- }
9702
- index++;
9703
- }
9704
- if (opts.costLedger) {
9705
- const u = opts.pool.utilization();
9706
- await opts.costLedger.setPoolUtilization(u.busyMs, u.totalMs);
9707
- }
9708
- return variants;
9709
- }
9710
- };
9711
- }
9712
-
9713
- // src/composite-mutator.ts
9714
- function createCompositeMutator(opts) {
9715
- const recentScores = [];
9716
- const plateauThreshold = opts.plateauThreshold ?? 0.02;
9717
- const plateauPatience = opts.plateauPatience ?? 2;
9718
- function pickMode(args) {
9719
- recentScores.push(args.parentAggregate.meanScore);
9720
- switch (opts.policy) {
9721
- case "primary-only":
9722
- return { mode: "primary", reason: "policy=primary-only" };
9723
- case "secondary-only":
9724
- if (!opts.secondary)
9725
- return {
9726
- mode: "primary",
9727
- reason: "secondary-only requested but no secondary mutator wired"
9728
- };
9729
- return { mode: "secondary", reason: "policy=secondary-only" };
9730
- case "alternate":
9731
- if (!opts.secondary)
9732
- return { mode: "primary", reason: "alternate requested but no secondary mutator wired" };
9733
- return args.generation % 2 === 1 ? { mode: "secondary", reason: `alternate: gen${args.generation} odd \u2192 secondary` } : { mode: "primary", reason: `alternate: gen${args.generation} even \u2192 primary` };
9734
- case "plateau": {
9735
- if (!opts.secondary)
9736
- return { mode: "primary", reason: "plateau requested but no secondary mutator wired" };
9737
- if (recentScores.length <= plateauPatience) {
9738
- return { mode: "primary", reason: "plateau: warming up with primary mutations" };
9739
- }
9740
- const window = recentScores.slice(-plateauPatience - 1);
9741
- const deltas = window.slice(1).map((v, i) => v - window[i]);
9742
- const stagnant = deltas.every((d) => d < plateauThreshold);
9743
- if (stagnant) {
9744
- return {
9745
- mode: "split",
9746
- reason: `plateau detected (${deltas.map((d) => d.toFixed(3)).join(", ")}) \u2192 split`
9747
- };
9748
- }
9749
- return {
9750
- mode: "primary",
9751
- reason: `plateau: still improving (${deltas[deltas.length - 1].toFixed(3)})`
9752
- };
9753
- }
9754
- }
9755
- }
9756
- return {
9757
- async mutate(args) {
9758
- const { mode, reason } = pickMode(args);
9759
- opts.onPolicyDecision?.({ generation: args.generation, chose: mode, reason });
9760
- if (mode === "primary") return opts.primary.mutate(args);
9761
- if (mode === "secondary" && opts.secondary) return opts.secondary.mutate(args);
9762
- if (mode === "split" && opts.secondary) {
9763
- const secondaryShare = Math.ceil(args.childCount / 2);
9764
- const primaryShare = args.childCount - secondaryShare;
9765
- const [primaryChildren, secondaryChildren] = await Promise.all([
9766
- opts.primary.mutate({ ...args, childCount: primaryShare }),
9767
- opts.secondary.mutate({ ...args, childCount: secondaryShare })
9768
- ]);
9769
- return [...primaryChildren, ...secondaryChildren];
9770
- }
9771
- return opts.primary.mutate(args);
9772
- }
9773
- };
9774
- }
9775
-
9776
9486
  // src/discover-personas.ts
9777
9487
  import { promises as fs } from "fs";
9778
9488
  import { basename, extname, join as join3 } from "path";
@@ -9819,238 +9529,6 @@ async function discoverPersonas(dir, opts = {}) {
9819
9529
  return results;
9820
9530
  }
9821
9531
 
9822
- // src/evolution-telemetry.ts
9823
- import { appendFileSync as appendFileSync4, existsSync as existsSync7, mkdirSync as mkdirSync4, readFileSync as readFileSync6, writeFileSync } from "fs";
9824
- import { dirname as dirname4 } from "path";
9825
- var MutationTelemetry = class {
9826
- appender;
9827
- constructor(path) {
9828
- this.appender = new LockedJsonlAppender(path);
9829
- }
9830
- async record(attempt) {
9831
- await this.appender.append(attempt);
9832
- }
9833
- };
9834
- var TrialTelemetry = class {
9835
- appender;
9836
- constructor(path) {
9837
- this.appender = new LockedJsonlAppender(path);
9838
- }
9839
- async record(attempt) {
9840
- await this.appender.append(attempt);
9841
- }
9842
- };
9843
- var LineageRecorder = class {
9844
- path;
9845
- snapshotPath;
9846
- mutex = new Mutex();
9847
- nodes = /* @__PURE__ */ new Map();
9848
- kindOf;
9849
- constructor(path, kindOf) {
9850
- this.path = path;
9851
- this.snapshotPath = `${path}.snapshot`;
9852
- this.kindOf = kindOf ?? defaultKindOf;
9853
- mkdirSync4(dirname4(path), { recursive: true });
9854
- if (existsSync7(this.snapshotPath)) {
9855
- try {
9856
- const parsed = JSON.parse(readFileSync6(this.snapshotPath, "utf-8"));
9857
- for (const n of parsed) this.nodes.set(n.id, n);
9858
- } catch {
9859
- }
9860
- }
9861
- if (existsSync7(path)) {
9862
- try {
9863
- for (const line of readFileSync6(path, "utf-8").split("\n")) {
9864
- if (!line.trim()) continue;
9865
- try {
9866
- const entry = JSON.parse(line);
9867
- const prev = this.nodes.get(entry.id);
9868
- this.nodes.set(entry.id, { ...prev, ...entry });
9869
- } catch {
9870
- }
9871
- }
9872
- } catch {
9873
- }
9874
- }
9875
- if (existsSync7(path) && this.nodes.size === 0) {
9876
- try {
9877
- const raw = readFileSync6(path, "utf-8").trim();
9878
- if (raw.startsWith("[")) {
9879
- const parsed = JSON.parse(raw);
9880
- for (const n of parsed) this.nodes.set(n.id, n);
9881
- }
9882
- } catch {
9883
- }
9884
- }
9885
- }
9886
- async upsert(node) {
9887
- await this.mutex.runExclusive(() => {
9888
- const prev = this.nodes.get(node.id);
9889
- this.nodes.set(node.id, { ...prev, ...node });
9890
- try {
9891
- if (existsSync7(this.path)) {
9892
- const head = readFileSync6(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
9893
- if (head === "[") {
9894
- writeFileSync(this.path, "");
9895
- }
9896
- }
9897
- } catch {
9898
- }
9899
- appendFileSync4(this.path, `${JSON.stringify(this.nodes.get(node.id))}
9900
- `);
9901
- });
9902
- }
9903
- async upsertVariant(variant, opts = {}) {
9904
- await this.upsert({
9905
- id: variant.id,
9906
- parentId: variant.parentId ?? null,
9907
- generation: variant.generation,
9908
- kind: this.kindOf(variant),
9909
- ...variant.rationale ? { rationale: variant.rationale } : {},
9910
- ...opts.omitPayload || variant.payload === void 0 ? {} : { payload: variant.payload }
9911
- });
9912
- }
9913
- snapshot() {
9914
- return [...this.nodes.values()];
9915
- }
9916
- /**
9917
- * Write the current consolidated state to `<path>.snapshot` so external
9918
- * tools can read it without replaying the event log. Idempotent.
9919
- */
9920
- async compact() {
9921
- await this.mutex.runExclusive(() => {
9922
- writeFileSync(this.snapshotPath, JSON.stringify([...this.nodes.values()], null, 2));
9923
- });
9924
- }
9925
- };
9926
- function defaultKindOf(variant) {
9927
- if (variant.parentId === void 0) return "seed";
9928
- const payload = variant.payload;
9929
- if (payload && typeof payload === "object" && payload.codeMutation) return "code";
9930
- return "prompt";
9931
- }
9932
- function emptyGenBucket() {
9933
- return {
9934
- mutatorPromptUsd: 0,
9935
- mutatorCodeUsd: 0,
9936
- scorerPromptUsd: 0,
9937
- scorerCodeUsd: 0,
9938
- trialsCounted: 0,
9939
- cachedTrials: 0
9940
- };
9941
- }
9942
- var CostLedger = class {
9943
- totals = {
9944
- mutatorPromptUsd: 0,
9945
- mutatorCodeUsd: 0,
9946
- scorerPromptUsd: 0,
9947
- scorerCodeUsd: 0,
9948
- trialsCounted: 0,
9949
- cachedTrials: 0,
9950
- poolBusyMs: 0,
9951
- poolUtilizationPct: 0,
9952
- byGeneration: {}
9953
- };
9954
- path;
9955
- mutex = new Mutex();
9956
- constructor(path) {
9957
- this.path = path;
9958
- if (existsSync7(path)) {
9959
- try {
9960
- const loaded = JSON.parse(readFileSync6(path, "utf-8"));
9961
- for (const k of Object.keys(this.totals)) {
9962
- if (k === "byGeneration") {
9963
- if (loaded.byGeneration && typeof loaded.byGeneration === "object") {
9964
- this.totals.byGeneration = loaded.byGeneration;
9965
- }
9966
- continue;
9967
- }
9968
- const v = loaded[k];
9969
- if (typeof v === "number" && Number.isFinite(v)) {
9970
- ;
9971
- this.totals[k] = v;
9972
- }
9973
- }
9974
- } catch {
9975
- }
9976
- } else {
9977
- mkdirSync4(dirname4(path), { recursive: true });
9978
- }
9979
- }
9980
- genBucket(generation) {
9981
- if (generation === void 0) return null;
9982
- const key = String(generation);
9983
- if (!this.totals.byGeneration[key]) {
9984
- this.totals.byGeneration[key] = emptyGenBucket();
9985
- }
9986
- return this.totals.byGeneration[key];
9987
- }
9988
- async addMutation(channel, usd, opts = {}) {
9989
- await this.mutex.runExclusive(() => {
9990
- const bucket = this.genBucket(opts.generation);
9991
- if (channel === "prompt") {
9992
- this.totals.mutatorPromptUsd += usd;
9993
- if (bucket) bucket.mutatorPromptUsd += usd;
9994
- } else {
9995
- this.totals.mutatorCodeUsd += usd;
9996
- if (bucket) bucket.mutatorCodeUsd += usd;
9997
- }
9998
- this.persist();
9999
- });
10000
- }
10001
- async addTrial(channel, usd, cached, opts = {}) {
10002
- await this.mutex.runExclusive(() => {
10003
- const bucket = this.genBucket(opts.generation);
10004
- if (cached) {
10005
- this.totals.cachedTrials++;
10006
- this.totals.trialsCounted++;
10007
- if (bucket) {
10008
- bucket.cachedTrials++;
10009
- bucket.trialsCounted++;
10010
- }
10011
- this.persist();
10012
- return;
10013
- }
10014
- if (channel === "prompt") {
10015
- this.totals.scorerPromptUsd += usd;
10016
- if (bucket) bucket.scorerPromptUsd += usd;
10017
- } else {
10018
- this.totals.scorerCodeUsd += usd;
10019
- if (bucket) bucket.scorerCodeUsd += usd;
10020
- }
10021
- this.totals.trialsCounted++;
10022
- if (bucket) bucket.trialsCounted++;
10023
- this.persist();
10024
- });
10025
- }
10026
- async setPoolUtilization(busyMs, totalMs) {
10027
- await this.mutex.runExclusive(() => {
10028
- this.totals.poolBusyMs = busyMs;
10029
- this.totals.poolUtilizationPct = totalMs > 0 ? 100 * busyMs / totalMs : 0;
10030
- this.persist();
10031
- });
10032
- }
10033
- snapshot() {
10034
- const totalUsd = this.totals.mutatorPromptUsd + this.totals.mutatorCodeUsd + this.totals.scorerPromptUsd + this.totals.scorerCodeUsd;
10035
- const byGeneration = Object.entries(this.totals.byGeneration).map(([g, b]) => ({ generation: Number(g), ...b })).sort((a, b) => a.generation - b.generation);
10036
- return {
10037
- totalUsd,
10038
- mutatorPromptUsd: this.totals.mutatorPromptUsd,
10039
- mutatorCodeUsd: this.totals.mutatorCodeUsd,
10040
- scorerPromptUsd: this.totals.scorerPromptUsd,
10041
- scorerCodeUsd: this.totals.scorerCodeUsd,
10042
- trialsCounted: this.totals.trialsCounted,
10043
- cachedTrials: this.totals.cachedTrials,
10044
- poolBusyMs: this.totals.poolBusyMs,
10045
- poolUtilizationPct: this.totals.poolUtilizationPct,
10046
- byGeneration
10047
- };
10048
- }
10049
- persist() {
10050
- writeFileSync(this.path, JSON.stringify(this.totals, null, 2));
10051
- }
10052
- };
10053
-
10054
9532
  // src/golden-matcher.ts
10055
9533
  function matchGoldens(goldens, candidates, options = {}) {
10056
9534
  const extract = options.text ?? defaultExtract2;
@@ -10125,52 +9603,164 @@ function precision(goldens, candidates, options = {}) {
10125
9603
  return matched / candidates.length;
10126
9604
  }
10127
9605
 
10128
- // src/jsonl-trial-cache.ts
10129
- import { appendFileSync as appendFileSync5, existsSync as existsSync8, mkdirSync as mkdirSync5, readFileSync as readFileSync7 } from "fs";
10130
- import { dirname as dirname5 } from "path";
10131
- var JsonlTrialCache = class {
10132
- map = /* @__PURE__ */ new Map();
10133
- path;
10134
- appender;
10135
- constructor(path) {
10136
- this.path = path;
10137
- if (existsSync8(path)) {
10138
- for (const line of readFileSync7(path, "utf-8").split("\n")) {
10139
- if (!line.trim()) continue;
10140
- try {
10141
- const entry = JSON.parse(line);
10142
- this.map.set(entry.key, entry.result);
10143
- } catch {
10144
- }
10145
- }
10146
- } else {
10147
- mkdirSync5(dirname5(path), { recursive: true });
9606
+ // src/held-out-gate.ts
9607
+ var HeldOutGate = class {
9608
+ minProductiveRuns;
9609
+ pairedDeltaThreshold;
9610
+ overfitGapThreshold;
9611
+ baselineKey;
9612
+ confidence;
9613
+ resamples;
9614
+ seed;
9615
+ constructor(config) {
9616
+ if (!config.baselineKey) {
9617
+ throw new Error("HeldOutGate: baselineKey is required");
9618
+ }
9619
+ this.minProductiveRuns = config.minProductiveRuns ?? 3;
9620
+ this.pairedDeltaThreshold = config.pairedDeltaThreshold ?? 0;
9621
+ this.overfitGapThreshold = config.overfitGapThreshold ?? 0.15;
9622
+ this.baselineKey = config.baselineKey;
9623
+ this.confidence = config.confidence ?? 0.95;
9624
+ this.resamples = config.bootstrapResamples ?? 2e3;
9625
+ this.seed = config.seed;
9626
+ }
9627
+ /** Decide whether `candidate` should replace `baseline`. Pairing
9628
+ * is by (experimentId, seed) — identical experiment + seed pairs
9629
+ * the candidate run with the matching baseline run. Pairs without
9630
+ * a holdout score on both sides are dropped. */
9631
+ evaluate(candidate, baseline) {
9632
+ const candidateId = inferCandidateId(candidate, this.baselineKey);
9633
+ const baselineId = this.baselineKey;
9634
+ const baselineHoldoutByKey = indexHoldoutByKey(baseline);
9635
+ const beforeHoldout = [];
9636
+ const afterHoldout = [];
9637
+ for (const run of candidate) {
9638
+ if (run.splitTag !== "holdout") continue;
9639
+ if (run.outcome.holdoutScore === void 0) continue;
9640
+ const key = pairKey(run);
9641
+ const counterpart = baselineHoldoutByKey.get(key);
9642
+ if (counterpart === void 0) continue;
9643
+ beforeHoldout.push(counterpart);
9644
+ afterHoldout.push(run.outcome.holdoutScore);
9645
+ }
9646
+ const productiveRuns = beforeHoldout.length;
9647
+ const candidateSearchMean = mean4(scores(candidate, "searchScore", "search"));
9648
+ const candidateHoldoutMean = mean4(scores(candidate, "holdoutScore", "holdout"));
9649
+ const baselineSearchMean = mean4(scores(baseline, "searchScore", "search"));
9650
+ const baselineHoldoutMean = mean4(scores(baseline, "holdoutScore", "holdout"));
9651
+ const overfitGap = safeDiff(candidateSearchMean, candidateHoldoutMean);
9652
+ const baselineOverfitGap = safeDiff(baselineSearchMean, baselineHoldoutMean);
9653
+ if (productiveRuns < this.minProductiveRuns) {
9654
+ return {
9655
+ promote: false,
9656
+ candidateId,
9657
+ baselineId,
9658
+ evidence: {
9659
+ productiveRuns,
9660
+ medianPairedDelta: productiveRuns > 0 ? medianDelta(beforeHoldout, afterHoldout) : 0,
9661
+ pairedCI: { low: 0, high: 0 },
9662
+ pairedPValue: 1,
9663
+ searchScore: candidateSearchMean,
9664
+ holdoutScore: candidateHoldoutMean,
9665
+ overfitGap,
9666
+ baselineOverfitGap
9667
+ },
9668
+ reason: `few_runs: ${productiveRuns} paired holdout observation(s) < min ${this.minProductiveRuns}`,
9669
+ rejectionCode: "few_runs"
9670
+ };
10148
9671
  }
10149
- this.appender = new LockedJsonlAppender(path);
10150
- }
10151
- get(key) {
10152
- return this.map.get(key);
9672
+ const ci = pairedBootstrap(beforeHoldout, afterHoldout, {
9673
+ confidence: this.confidence,
9674
+ resamples: this.resamples,
9675
+ statistic: "median",
9676
+ seed: this.seed
9677
+ });
9678
+ const wilcoxon = wilcoxonSignedRank(beforeHoldout, afterHoldout);
9679
+ const evidence = {
9680
+ productiveRuns,
9681
+ medianPairedDelta: ci.median,
9682
+ pairedCI: { low: ci.low, high: ci.high },
9683
+ pairedPValue: wilcoxon.p,
9684
+ searchScore: candidateSearchMean,
9685
+ holdoutScore: candidateHoldoutMean,
9686
+ overfitGap,
9687
+ baselineOverfitGap
9688
+ };
9689
+ if (!(ci.low > this.pairedDeltaThreshold)) {
9690
+ return {
9691
+ promote: false,
9692
+ candidateId,
9693
+ baselineId,
9694
+ evidence,
9695
+ reason: `negative_delta: paired holdout median \u0394=${fmt(ci.median)} CI=[${fmt(ci.low)}, ${fmt(ci.high)}] does not clear threshold ${fmt(this.pairedDeltaThreshold)}`,
9696
+ rejectionCode: "negative_delta"
9697
+ };
9698
+ }
9699
+ if (Number.isFinite(overfitGap) && Number.isFinite(baselineOverfitGap) && overfitGap > baselineOverfitGap + this.overfitGapThreshold) {
9700
+ return {
9701
+ promote: false,
9702
+ candidateId,
9703
+ baselineId,
9704
+ evidence,
9705
+ reason: `overfit_gap: candidate gap=${fmt(overfitGap)} exceeds baseline gap=${fmt(baselineOverfitGap)} by more than ${fmt(this.overfitGapThreshold)}`,
9706
+ rejectionCode: "overfit_gap"
9707
+ };
9708
+ }
9709
+ return {
9710
+ promote: true,
9711
+ candidateId,
9712
+ baselineId,
9713
+ evidence,
9714
+ reason: `promote: paired holdout median \u0394=${fmt(ci.median)} CI=[${fmt(ci.low)}, ${fmt(ci.high)}] over ${productiveRuns} pairs; overfit gap candidate=${fmt(overfitGap)} vs baseline=${fmt(baselineOverfitGap)}`,
9715
+ rejectionCode: null
9716
+ };
10153
9717
  }
10154
- set(key, value) {
10155
- this.map.set(key, value);
10156
- const line = { key, result: value, writtenAt: Date.now() };
10157
- void this.appender.append(line);
9718
+ };
9719
+ function inferCandidateId(candidate, baselineKey) {
9720
+ for (const run of candidate) {
9721
+ if (run.candidateId && run.candidateId !== baselineKey) return run.candidateId;
10158
9722
  }
10159
- size() {
10160
- return this.map.size;
9723
+ return candidate[0]?.candidateId ?? "(unknown candidate)";
9724
+ }
9725
+ function indexHoldoutByKey(runs) {
9726
+ const out = /* @__PURE__ */ new Map();
9727
+ for (const r of runs) {
9728
+ if (r.splitTag !== "holdout") continue;
9729
+ if (r.outcome.holdoutScore === void 0) continue;
9730
+ out.set(pairKey(r), r.outcome.holdoutScore);
10161
9731
  }
10162
- /**
10163
- * Synchronous fallback path for tests / CLI tools that want to be sure
10164
- * the line is on disk before returning. Bypasses the mutex (single-
10165
- * threaded callers only).
10166
- */
10167
- setSync(key, value) {
10168
- this.map.set(key, value);
10169
- const line = { key, result: value, writtenAt: Date.now() };
10170
- appendFileSync5(this.path, `${JSON.stringify(line)}
10171
- `);
9732
+ return out;
9733
+ }
9734
+ function pairKey(r) {
9735
+ return `${r.experimentId}::${r.seed}`;
9736
+ }
9737
+ function scores(runs, field, splitFilter) {
9738
+ const out = [];
9739
+ for (const r of runs) {
9740
+ if (r.splitTag !== splitFilter) continue;
9741
+ const v = r.outcome[field];
9742
+ if (typeof v === "number" && Number.isFinite(v)) out.push(v);
10172
9743
  }
10173
- };
9744
+ return out;
9745
+ }
9746
+ function mean4(xs) {
9747
+ if (xs.length === 0) return Number.NaN;
9748
+ return xs.reduce((s, x) => s + x, 0) / xs.length;
9749
+ }
9750
+ function safeDiff(a, b) {
9751
+ if (!Number.isFinite(a) || !Number.isFinite(b)) return Number.NaN;
9752
+ return a - b;
9753
+ }
9754
+ function medianDelta(before, after) {
9755
+ const ds = before.map((b, i) => after[i] - b).sort((x, y) => x - y);
9756
+ if (ds.length === 0) return 0;
9757
+ const mid = Math.floor(ds.length / 2);
9758
+ return ds.length % 2 === 0 ? (ds[mid - 1] + ds[mid]) / 2 : ds[mid];
9759
+ }
9760
+ function fmt(x) {
9761
+ if (!Number.isFinite(x)) return String(x);
9762
+ return x.toFixed(4);
9763
+ }
10174
9764
 
10175
9765
  // src/judge-retry.ts
10176
9766
  var DEFAULT_MAX_ATTEMPTS = 3;
@@ -10250,9 +9840,9 @@ function passOrthogonality(input) {
10250
9840
  sims.push(cosineSimilarity(vectors[i], vectors[j]));
10251
9841
  }
10252
9842
  }
10253
- const mean4 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
9843
+ const mean5 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
10254
9844
  return {
10255
- orthogonality: Math.max(0, Math.min(1, 1 - mean4)),
9845
+ orthogonality: Math.max(0, Math.min(1, 1 - mean5)),
10256
9846
  passCount: passes.length,
10257
9847
  similarities: sims
10258
9848
  };
@@ -10351,6 +9941,44 @@ function referenceReplayScenarioToRunScore(scenarioScore, durationMs = 0) {
10351
9941
  };
10352
9942
  }
10353
9943
 
9944
+ // src/researcher.ts
9945
+ var CallbackResearcher = class {
9946
+ constructor(callbacks) {
9947
+ this.callbacks = callbacks;
9948
+ }
9949
+ callbacks;
9950
+ inspectFailures(runs) {
9951
+ return this.callbacks.inspectFailures(runs);
9952
+ }
9953
+ proposeChange(failures) {
9954
+ return this.callbacks.proposeChange(failures);
9955
+ }
9956
+ applyChange(changes, baseline) {
9957
+ return this.callbacks.applyChange(changes, baseline);
9958
+ }
9959
+ evaluateChange(plan) {
9960
+ return this.callbacks.evaluateChange(plan);
9961
+ }
9962
+ };
9963
+ var NoopResearcher = class {
9964
+ hint;
9965
+ constructor(hint = "NoopResearcher: no implementation wired") {
9966
+ this.hint = hint;
9967
+ }
9968
+ async inspectFailures(_runs) {
9969
+ throw new Error(`${this.hint} (inspectFailures not implemented)`);
9970
+ }
9971
+ async proposeChange(_failures) {
9972
+ throw new Error(`${this.hint} (proposeChange not implemented)`);
9973
+ }
9974
+ async applyChange(_changes, _baseline) {
9975
+ throw new Error(`${this.hint} (applyChange not implemented)`);
9976
+ }
9977
+ async evaluateChange(_plan) {
9978
+ throw new Error(`${this.hint} (evaluateChange not implemented)`);
9979
+ }
9980
+ };
9981
+
10354
9982
  // src/sandbox-pool.ts
10355
9983
  function createSandboxPool(opts) {
10356
9984
  if (opts.size < 1) throw new Error(`sandbox pool size must be >= 1 (got ${opts.size})`);
@@ -10453,55 +10081,6 @@ function createSandboxPool(opts) {
10453
10081
  };
10454
10082
  }
10455
10083
 
10456
- // src/trial-aggregator.ts
10457
- function meanOf(xs) {
10458
- if (xs.length === 0) return 0;
10459
- return xs.reduce((a, b) => a + b, 0) / xs.length;
10460
- }
10461
- function meanMetrics(rows) {
10462
- if (rows.length === 0) return {};
10463
- const keys = /* @__PURE__ */ new Set();
10464
- for (const row of rows) for (const k of Object.keys(row)) keys.add(k);
10465
- const out = {};
10466
- for (const k of keys) {
10467
- const xs = rows.map((r) => r[k]).filter((x) => typeof x === "number");
10468
- if (xs.length > 0) out[k] = meanOf(xs);
10469
- }
10470
- return out;
10471
- }
10472
- function aggregateTrialsByMode(trials, opts) {
10473
- const gradedTrials = trials.filter((t) => !t.error);
10474
- const judgeOk = gradedTrials.filter((t) => t.judgeSucceeded !== false);
10475
- const judgeFailed = gradedTrials.filter((t) => t.judgeSucceeded === false);
10476
- if (opts.mode === "strict-fail" && judgeFailed.length > 0) {
10477
- return {
10478
- meanScore: 0,
10479
- meanCost: 0,
10480
- meanDurationMs: 0,
10481
- okRate: 0,
10482
- countedTrials: 0,
10483
- excludedFailedTrials: judgeFailed.length,
10484
- totalTrials: trials.length,
10485
- metrics: {},
10486
- strictFailure: {
10487
- failedCount: judgeFailed.length,
10488
- firstError: judgeFailed.find((t) => t.judgeError)?.judgeError
10489
- }
10490
- };
10491
- }
10492
- const counted = opts.mode === "exclude-failed" ? judgeOk : gradedTrials;
10493
- return {
10494
- meanScore: meanOf(counted.map((t) => t.score)),
10495
- meanCost: meanOf(counted.map((t) => t.cost ?? 0)),
10496
- meanDurationMs: meanOf(counted.map((t) => t.durationMs ?? 0)),
10497
- okRate: gradedTrials.length === 0 ? 0 : gradedTrials.filter((t) => t.ok).length / gradedTrials.length,
10498
- countedTrials: counted.length,
10499
- excludedFailedTrials: judgeFailed.length,
10500
- totalTrials: trials.length,
10501
- metrics: meanMetrics(counted.map((t) => t.metrics ?? {}))
10502
- };
10503
- }
10504
-
10505
10084
  // src/otel-pipeline.ts
10506
10085
  function withOtelPipeline(opts) {
10507
10086
  const config = {
@@ -10594,17 +10173,17 @@ function traceJudge(judge, judgeName, opts) {
10594
10173
  }
10595
10174
  });
10596
10175
  try {
10597
- const scores = await judge(tc, input);
10598
- const composite = scores.length > 0 ? scores.reduce((sum3, s) => sum3 + s.score, 0) / scores.length : 0;
10176
+ const scores2 = await judge(tc, input);
10177
+ const composite = scores2.length > 0 ? scores2.reduce((sum3, s) => sum3 + s.score, 0) / scores2.length : 0;
10599
10178
  await span.end({
10600
10179
  attributes: {
10601
10180
  "judge.name": judgeName,
10602
10181
  "judge.composite_score": composite,
10603
- "judge.dimension_count": scores.length,
10182
+ "judge.dimension_count": scores2.length,
10604
10183
  "eval.phase": "judge"
10605
10184
  }
10606
10185
  });
10607
- return scores;
10186
+ return scores2;
10608
10187
  } catch (err) {
10609
10188
  await span.fail(err instanceof Error ? err : String(err));
10610
10189
  throw err;
@@ -10631,8 +10210,8 @@ function traceJudgeEnsemble(judges, judgeNames, opts) {
10631
10210
  emitter: opts.emitter,
10632
10211
  parentSpanId: ensembleSpan.span.spanId
10633
10212
  });
10634
- const scores = await tracedFn(tc, input);
10635
- allScores.push(...scores);
10213
+ const scores2 = await tracedFn(tc, input);
10214
+ allScores.push(...scores2);
10636
10215
  }
10637
10216
  const composite = allScores.length > 0 ? allScores.reduce((sum3, s) => sum3 + s.score, 0) / allScores.length : 0;
10638
10217
  await ensembleSpan.end({
@@ -10650,48 +10229,6 @@ function traceJudgeEnsemble(judges, judgeNames, opts) {
10650
10229
  }
10651
10230
  };
10652
10231
  }
10653
-
10654
- // src/traced-mutator.ts
10655
- function traceMutator(adapter, opts) {
10656
- return {
10657
- async mutate(args) {
10658
- const span = await opts.emitter.span({
10659
- kind: "llm",
10660
- name: `mutator:gen-${args.generation}`,
10661
- parentSpanId: opts.parentSpanId,
10662
- attributes: {
10663
- "mutator.parent_id": args.parent.id,
10664
- "mutator.generation": args.generation,
10665
- "mutator.child_count": args.childCount,
10666
- "mutator.top_trials": args.topTrials.length,
10667
- "mutator.bottom_trials": args.bottomTrials.length,
10668
- "mutator.parent_score": args.parentAggregate.meanScore,
10669
- "eval.phase": "mutator"
10670
- }
10671
- });
10672
- try {
10673
- const children = await adapter.mutate(args);
10674
- await span.end({
10675
- attributes: {
10676
- "mutator.parent_id": args.parent.id,
10677
- "mutator.generation": args.generation,
10678
- "mutator.child_count": args.childCount,
10679
- "mutator.top_trials": args.topTrials.length,
10680
- "mutator.bottom_trials": args.bottomTrials.length,
10681
- "mutator.parent_score": args.parentAggregate.meanScore,
10682
- "mutator.produced_count": children.length,
10683
- "mutator.child_ids": children.map((c) => c.id).join(","),
10684
- "eval.phase": "mutator"
10685
- }
10686
- });
10687
- return children;
10688
- } catch (err) {
10689
- await span.fail(err instanceof Error ? err : String(err));
10690
- throw err;
10691
- }
10692
- }
10693
- };
10694
- }
10695
10232
  export {
10696
10233
  AGENT_PROFILE_KINDS,
10697
10234
  ANALYST_SEVERITIES,
@@ -10709,7 +10246,6 @@ export {
10709
10246
  CaptureIntegrityError,
10710
10247
  ConfigError,
10711
10248
  ConvergenceTracker,
10712
- CostLedger,
10713
10249
  CostTracker,
10714
10250
  D1ExperimentStore,
10715
10251
  DEFAULT_AGENT_SLOS,
@@ -10750,15 +10286,12 @@ export {
10750
10286
  InMemoryFeedbackTrajectoryStore,
10751
10287
  InMemoryRawProviderSink,
10752
10288
  InMemoryTraceStore,
10753
- InMemoryTrialCache,
10754
10289
  InMemoryWorkspaceInspector,
10755
- JsonlTrialCache,
10756
10290
  JudgeError,
10757
10291
  JudgeRunner,
10758
10292
  KIND_EXPECTED_SUBJECTS,
10759
10293
  KNOWLEDGE_GAP_KIND_SPEC,
10760
10294
  KNOWLEDGE_POISONING_KIND_SPEC,
10761
- LineageRecorder,
10762
10295
  LlmCallError,
10763
10296
  LlmClient,
10764
10297
  LlmRouteAssertionError,
@@ -10766,7 +10299,6 @@ export {
10766
10299
  MODEL_PRICING,
10767
10300
  MetricsCollector,
10768
10301
  MultiLayerVerifier,
10769
- MutationTelemetry,
10770
10302
  Mutex,
10771
10303
  NoopRawProviderSink,
10772
10304
  NoopResearcher,
@@ -10800,7 +10332,6 @@ export {
10800
10332
  TraceEmitter,
10801
10333
  TraceFileMissingError,
10802
10334
  TraceNotFoundError,
10803
- TrialTelemetry,
10804
10335
  UNIVERSAL_FINDERS,
10805
10336
  ValidationError,
10806
10337
  VerificationError,
@@ -10812,7 +10343,6 @@ export {
10812
10343
  aggregateLlm,
10813
10344
  aggregatePrReviewScore,
10814
10345
  aggregateRunScore,
10815
- aggregateTrialsByMode,
10816
10346
  allCriticalPassed,
10817
10347
  analyzeAntiSlop,
10818
10348
  analyzeSeries,
@@ -10881,7 +10411,6 @@ export {
10881
10411
  corpusInterRaterAgreementFromJudgeScores,
10882
10412
  createAntiSlopJudge,
10883
10413
  createChatClient,
10884
- createCompositeMutator,
10885
10414
  createCustomJudge,
10886
10415
  createDefaultReviewer,
10887
10416
  createDomainExpertJudge,
@@ -10894,7 +10423,6 @@ export {
10894
10423
  createOtelTracingStore,
10895
10424
  createReplayFetch,
10896
10425
  createRunCriticAdapter,
10897
- createSandboxCodeMutator,
10898
10426
  createSandboxPool,
10899
10427
  createSemanticConceptJudge,
10900
10428
  createSemanticConceptJudgeAdapter,
@@ -10908,7 +10436,6 @@ export {
10908
10436
  decideReferenceReplayRunPromotion,
10909
10437
  defaultIsMaterial,
10910
10438
  defaultJudges,
10911
- defaultMultiShotObjectives,
10912
10439
  defaultProviderRedactor,
10913
10440
  defaultReferenceReplayMatcher,
10914
10441
  defaultTraceInsightPanel,
@@ -10966,11 +10493,6 @@ export {
10966
10493
  inMemoryReferenceReplayStore,
10967
10494
  inMemoryReviewStore,
10968
10495
  inferDomainKeywords,
10969
- integrationAsi,
10970
- integrationGateEvals,
10971
- integrationInvokeFailedPayload,
10972
- integrationManifestResolvedPayload,
10973
- integrationManifestValidatedPayload,
10974
10496
  interRaterReliability,
10975
10497
  iqr,
10976
10498
  isJudgeSpan,
@@ -11048,7 +10570,6 @@ export {
11048
10570
  referenceReplayScenarioToRunScore,
11049
10571
  regexMatch,
11050
10572
  regexMatches,
11051
- releaseTraceEvidenceFromMultiShotTrials,
11052
10573
  renderFindingSubject,
11053
10574
  renderMarkdown,
11054
10575
  renderMarkdownReport,
@@ -11083,9 +10604,6 @@ export {
11083
10604
  runKeywordCoverageJudge,
11084
10605
  runKeywordCoverageJudgeUrl,
11085
10606
  runLiveProof,
11086
- runMultiShotOptimization,
11087
- runProductionLoop,
11088
- runPromptEvolution,
11089
10607
  runProposeReview,
11090
10608
  runProposeReviewAsControlLoop,
11091
10609
  runReferenceReplay,
@@ -11134,9 +10652,7 @@ export {
11134
10652
  traceAnalystOnRunComplete,
11135
10653
  traceJudge,
11136
10654
  traceJudgeEnsemble,
11137
- traceMutator,
11138
10655
  tracedAnalyzeTraces,
11139
- trialTraceFromMultiShotTrial,
11140
10656
  typoMutator,
11141
10657
  urlContains,
11142
10658
  userQuestionsForKnowledgeGaps,