@tangle-network/agent-eval 0.71.0 → 0.72.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/CHANGELOG.md +63 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/otel.d.ts +3 -2
  5. package/dist/agent-profile-DYRboYWu.d.ts +364 -0
  6. package/dist/analyst/index.d.ts +221 -0
  7. package/dist/analyst/index.js +371 -0
  8. package/dist/analyst/index.js.map +1 -0
  9. package/dist/analyst-t7zZS3TV.d.ts +88 -0
  10. package/dist/campaign/index.d.ts +485 -9
  11. package/dist/campaign/index.js +618 -30
  12. package/dist/campaign/index.js.map +1 -1
  13. package/dist/chunk-7W4SM7FD.js +1075 -0
  14. package/dist/chunk-7W4SM7FD.js.map +1 -0
  15. package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
  16. package/dist/chunk-JHA3ZGSO.js +1496 -0
  17. package/dist/chunk-JHA3ZGSO.js.map +1 -0
  18. package/dist/{chunk-VMAYE3LM.js → chunk-JYE3WOTE.js} +57 -9
  19. package/dist/{chunk-VMAYE3LM.js.map → chunk-JYE3WOTE.js.map} +1 -1
  20. package/dist/chunk-LB2UOI5F.js +412 -0
  21. package/dist/chunk-LB2UOI5F.js.map +1 -0
  22. package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
  23. package/dist/chunk-VUINJM5M.js.map +1 -0
  24. package/dist/chunk-WYIHD6EB.js +1044 -0
  25. package/dist/chunk-WYIHD6EB.js.map +1 -0
  26. package/dist/{chunk-6QZUCFKM.js → chunk-XPILG2CA.js} +120 -3
  27. package/dist/chunk-XPILG2CA.js.map +1 -0
  28. package/dist/{chunk-6XQIEUQ2.js → chunk-ZPSKPT3V.js} +5 -3
  29. package/dist/{chunk-6XQIEUQ2.js.map → chunk-ZPSKPT3V.js.map} +1 -1
  30. package/dist/contract/index.d.ts +17 -13
  31. package/dist/contract/index.js +14 -8
  32. package/dist/contract/index.js.map +1 -1
  33. package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
  34. package/dist/control.d.ts +2 -2
  35. package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
  36. package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
  37. package/dist/hosted/index.d.ts +223 -2
  38. package/dist/index.d.ts +49 -1323
  39. package/dist/index.js +339 -2627
  40. package/dist/index.js.map +1 -1
  41. package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
  42. package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
  43. package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
  44. package/dist/openapi.json +1 -1
  45. package/dist/pareto-E-pembql.d.ts +81 -0
  46. package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
  47. package/dist/redact-B40YG2M_.d.ts +45 -0
  48. package/dist/registry-DuVYiTvw.d.ts +128 -0
  49. package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
  50. package/dist/rl.d.ts +4 -3
  51. package/dist/rl.js +4 -4
  52. package/dist/{run-campaign-BVY3RGAZ.js → run-campaign-OVEZF24D.js} +2 -2
  53. package/dist/run-critic-BAIjX99r.d.ts +56 -0
  54. package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
  55. package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
  56. package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
  57. package/dist/traces.d.ts +371 -308
  58. package/dist/traces.js +43 -18
  59. package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
  60. package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
  61. package/dist/wire/index.d.ts +1 -1
  62. package/dist/workflow/index.d.ts +494 -0
  63. package/dist/workflow/index.js +2177 -0
  64. package/dist/workflow/index.js.map +1 -0
  65. package/docs/design/self-improvement-roadmap.md +106 -0
  66. package/package.json +36 -12
  67. package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
  68. package/dist/chunk-6QZUCFKM.js.map +0 -1
  69. package/dist/chunk-ODGETRTM.js.map +0 -1
  70. package/dist/chunk-PQV2TKC3.js +0 -27
  71. package/dist/chunk-PQV2TKC3.js.map +0 -1
  72. /package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
  73. /package/dist/{run-campaign-BVY3RGAZ.js.map → run-campaign-OVEZF24D.js.map} +0 -0
@@ -1,4 +1,5 @@
1
1
  import {
2
+ buildEvidenceVector,
2
3
  composeGate,
3
4
  defaultProductionGate,
4
5
  detectScale,
@@ -6,11 +7,17 @@ import {
6
7
  evolutionaryDriver,
7
8
  heldoutSignificance,
8
9
  pairHoldout,
10
+ paretoPolicy,
11
+ paretoSignificanceGate,
9
12
  runEval
10
- } from "../chunk-6QZUCFKM.js";
13
+ } from "../chunk-XPILG2CA.js";
11
14
  import {
12
- agentProfileHash
13
- } from "../chunk-PQV2TKC3.js";
15
+ agentProfileHash,
16
+ estimateCost,
17
+ extractProducedState,
18
+ isModelPriced,
19
+ verifyCompletion
20
+ } from "../chunk-LB2UOI5F.js";
14
21
  import {
15
22
  buildLoopProvenanceRecord,
16
23
  campaignBreakdown,
@@ -27,36 +34,454 @@ import {
27
34
  openAutoPr,
28
35
  provenanceRecordPath,
29
36
  provenanceSpansPath,
37
+ renderAnalystEvidence,
30
38
  runImprovementLoop,
31
39
  runOptimization,
32
40
  surfaceContentHash,
33
41
  surfaceHash
34
- } from "../chunk-VMAYE3LM.js";
42
+ } from "../chunk-JYE3WOTE.js";
35
43
  import {
36
44
  assertRealBackend,
37
45
  fsCampaignStorage,
38
46
  inMemoryCampaignStorage,
39
47
  runCampaign,
40
48
  summarizeBackendIntegrity
41
- } from "../chunk-6XQIEUQ2.js";
49
+ } from "../chunk-ZPSKPT3V.js";
50
+ import {
51
+ AnalystRegistry,
52
+ DEFAULT_TRACE_ANALYST_KINDS,
53
+ createTraceAnalystKind
54
+ } from "../chunk-WYIHD6EB.js";
42
55
  import "../chunk-YV7J7X5N.js";
43
56
  import {
44
- validateRunRecord
45
- } from "../chunk-F3SRAAZO.js";
57
+ callLlm
58
+ } from "../chunk-IHDHUN2X.js";
46
59
  import {
47
60
  pairedBootstrap
48
61
  } from "../chunk-ITBRCT73.js";
49
62
  import "../chunk-GGE4NNQT.js";
50
- import "../chunk-VSMTAMNK.js";
51
63
  import {
52
- callLlm
53
- } from "../chunk-IHDHUN2X.js";
64
+ OtlpFileTraceStore,
65
+ analyzeTraces
66
+ } from "../chunk-VUINJM5M.js";
54
67
  import "../chunk-PC4UYEBM.js";
68
+ import {
69
+ validateRunRecord
70
+ } from "../chunk-F3SRAAZO.js";
71
+ import "../chunk-VSMTAMNK.js";
55
72
  import {
56
73
  AgentEvalError
57
74
  } from "../chunk-3BFEG2F6.js";
58
75
  import "../chunk-PZ5AY32C.js";
59
76
 
77
+ // src/campaign/analyst-surface.ts
78
+ function surfaceToText(surface) {
79
+ if (typeof surface === "string") return surface;
80
+ throw new Error(
81
+ `buildAnalystSurfaceDispatch: the analyst surface must be a string actorDescription, got a ${surface.kind}-tier surface (${surface.worktreeRef}). The analyst prompt is prompt-tier.`
82
+ );
83
+ }
84
+ function buildAnalystSurfaceDispatch(opts) {
85
+ const analyze = opts.analyze ?? analyzeTraces;
86
+ return async (surface, scenario, _ctx) => {
87
+ const actorDescription = surfaceToText(surface);
88
+ const res = await analyze(
89
+ { question: scenario.question },
90
+ { ...opts.analystOptions, actorDescription, source: scenario.source }
91
+ );
92
+ return {
93
+ answer: res.answer,
94
+ findings: res.findings,
95
+ actorPromptVersion: res.actorPromptVersion
96
+ };
97
+ };
98
+ }
99
+ function failureModeRecallJudge(opts = {}) {
100
+ const recallWeight = opts.recallWeight ?? 0.5;
101
+ return {
102
+ name: "failure-mode-recall",
103
+ dimensions: [
104
+ { key: "recall", description: "fraction of ground-truth failure modes the analyst surfaced" },
105
+ {
106
+ key: "precision",
107
+ description: "1 \u2212 share of findings that named a failure/tool/error absent from this corpus"
108
+ }
109
+ ],
110
+ appliesTo: (s) => s.kind === "analyst-surface",
111
+ score({ artifact, scenario }) {
112
+ const modes = scenario.expectedFailureModes;
113
+ if (modes.length === 0) {
114
+ throw new Error(
115
+ `failureModeRecallJudge: scenario '${scenario.id}' has no expectedFailureModes \u2014 refusing to score (a vacuous 1.0 would corrupt the comparison)`
116
+ );
117
+ }
118
+ const hay = artifact.findings.join("\n").toLowerCase();
119
+ const matched = modes.filter((m) => m.cues.some((c) => hay.includes(c.toLowerCase())));
120
+ const recall = matched.length / modes.length;
121
+ const forbidden = (scenario.forbiddenCues ?? []).map((c) => c.toLowerCase());
122
+ let precision = 1;
123
+ let hallucinated = 0;
124
+ if (forbidden.length > 0) {
125
+ const denom = Math.max(1, artifact.findings.length);
126
+ hallucinated = artifact.findings.filter(
127
+ (f) => forbidden.some((c) => f.toLowerCase().includes(c))
128
+ ).length;
129
+ precision = 1 - hallucinated / denom;
130
+ }
131
+ const composite = forbidden.length > 0 ? recallWeight * recall + (1 - recallWeight) * precision : recall;
132
+ const missed = modes.filter((m) => !matched.includes(m)).map((m) => m.id);
133
+ const notes = `matched ${matched.length}/${modes.length} failure modes` + (missed.length ? `; missed [${missed.join(", ")}]` : "") + (hallucinated ? `; ${hallucinated} out-of-corpus finding(s)` : "");
134
+ return { dimensions: { recall, precision }, composite, notes };
135
+ }
136
+ };
137
+ }
138
+
139
+ // src/campaign/drivers/_findings-text.ts
140
+ function findingToLesson(f) {
141
+ if (typeof f === "string") return f.trim() || null;
142
+ if (f && typeof f === "object") {
143
+ const o = f;
144
+ const cand = o.recommended_action ?? o.claim ?? o.lesson ?? o.text ?? o.message;
145
+ if (typeof cand === "string" && cand.trim()) return cand.trim();
146
+ }
147
+ return null;
148
+ }
149
+ function normKey(s) {
150
+ return s.toLowerCase().replace(/\s+/g, " ").replace(/[.;:!?\s]+$/, "").trim();
151
+ }
152
+ function surfaceToText2(surface) {
153
+ if (typeof surface === "string") return surface;
154
+ throw new Error(
155
+ `curator driver: surface must be a string prompt, got a ${surface.kind}-tier surface (${surface.worktreeRef}) \u2014 curation is prompt-tier`
156
+ );
157
+ }
158
+
159
+ // src/campaign/drivers/ace.ts
160
+ var BLOCK_START = "<!-- BEGIN ace-playbook (auto-managed by aceDriver) -->";
161
+ var BLOCK_END = "<!-- END ace-playbook -->";
162
+ var DEFAULT_HEADING = "## Playbook (accumulated lessons \u2014 append-only)";
163
+ function parsePlaybook(surface) {
164
+ const start = surface.indexOf(BLOCK_START);
165
+ const end = surface.indexOf(BLOCK_END);
166
+ if (start === -1 || end === -1 || end < start) return [];
167
+ const body = surface.slice(start + BLOCK_START.length, end);
168
+ const out = [];
169
+ for (const raw of body.split("\n")) {
170
+ const line = raw.trim();
171
+ if (!line.startsWith("- ")) continue;
172
+ const item = line.slice(2).trim();
173
+ const tag = /^\[g(-?\d+)\]\s*(.*)$/.exec(item);
174
+ if (tag) out.push({ gen: Number(tag[1]), text: tag[2].trim() });
175
+ else out.push({ gen: -1, text: item });
176
+ }
177
+ return out;
178
+ }
179
+ function stripBlock(surface) {
180
+ const start = surface.indexOf(BLOCK_START);
181
+ const end = surface.indexOf(BLOCK_END);
182
+ if (start === -1 || end === -1 || end < start) return surface.trimEnd();
183
+ return (surface.slice(0, start) + surface.slice(end + BLOCK_END.length)).trimEnd();
184
+ }
185
+ function aceDriver(opts = {}) {
186
+ const maxEntries = opts.maxEntries ?? 50;
187
+ if (maxEntries < 1) throw new Error("aceDriver: maxEntries must be >= 1");
188
+ const heading = opts.sectionHeading ?? DEFAULT_HEADING;
189
+ return {
190
+ kind: "ace",
191
+ async propose(ctx) {
192
+ const parent = surfaceToText2(ctx.currentSurface);
193
+ const existing = parsePlaybook(parent);
194
+ const seen = new Set(existing.map((b) => normKey(b.text)));
195
+ const fresh = [];
196
+ for (const f of ctx.findings ?? []) {
197
+ const lesson = findingToLesson(f);
198
+ if (!lesson) continue;
199
+ const k = normKey(lesson);
200
+ if (!k || seen.has(k)) continue;
201
+ seen.add(k);
202
+ fresh.push({ gen: ctx.generation, text: lesson });
203
+ }
204
+ if (fresh.length === 0) return [];
205
+ const all = [...existing, ...fresh].slice(-maxEntries);
206
+ const block = [
207
+ BLOCK_START,
208
+ heading,
209
+ ...all.map((b) => `- [g${b.gen}] ${b.text}`),
210
+ BLOCK_END
211
+ ].join("\n");
212
+ const base = stripBlock(parent);
213
+ const surface = base ? `${base}
214
+
215
+ ${block}` : block;
216
+ return [
217
+ {
218
+ surface,
219
+ label: `ace-playbook +${fresh.length}`,
220
+ rationale: `appended ${fresh.length} new lesson(s) from gen ${ctx.generation} findings (playbook now ${all.length} bullet(s), append-only)`
221
+ }
222
+ ];
223
+ }
224
+ };
225
+ }
226
+
227
+ // src/campaign/drivers/guide.ts
228
+ var DRIVER_GUIDE = {
229
+ gepa: {
230
+ summary: "Reflective full-surface rewrite: reflects on the best parent\u2019s weakest dimensions + per-scenario scores, proposes targeted rewrites, maintains a Pareto frontier across generations.",
231
+ surface: "prompt",
232
+ strategy: "reflective-rewrite",
233
+ whenUse: "The default for a prompt/instruction surface with headroom \u2014 broad rewrites plus Pareto-optimal exploration across scenarios.",
234
+ cost: "medium"
235
+ },
236
+ skillOpt: {
237
+ summary: "Patch-mode: bounded, anchored add/delete/replace edits to ONE skill document, so a good rule introduced earlier is not clobbered by a later sweeping rewrite.",
238
+ surface: "skill-doc",
239
+ strategy: "anchored-patch",
240
+ whenUse: 'Refining a skill document incrementally where accumulated rules must be preserved; the edit budget is the "textual learning rate".',
241
+ cost: "medium"
242
+ },
243
+ ace: {
244
+ summary: "Append-mostly playbook curator: grows the playbook with provenance-tagged delta bullets, never merging \u2014 guards against context collapse.",
245
+ surface: "playbook",
246
+ strategy: "append-only",
247
+ whenUse: "Accumulating many specific, hard-won lessons over time where dedup/rewrite would summarize away detail.",
248
+ cost: "low"
249
+ },
250
+ memoryCuration: {
251
+ summary: "Dedup-and-rank curator: builds a compact searchable memory and grafts the most relevant, most-recurrent lessons onto the surface.",
252
+ surface: "memory",
253
+ strategy: "dedup-curate",
254
+ whenUse: "Accumulating lessons while keeping the surface compact \u2014 the complement to ace when context size matters more than verbatim provenance.",
255
+ cost: "low"
256
+ },
257
+ halo: {
258
+ summary: "Wraps the real external HALO engine (Inference.net, `halo` CLI) and applies its findings to the prompt via one LLM edit.",
259
+ surface: "prompt",
260
+ strategy: "analysis-edit",
261
+ whenUse: "Benchmarking: compete HALO head-to-head against our own analysis on identical traces via compareDrivers.",
262
+ cost: "high",
263
+ external: true
264
+ },
265
+ traceAnalyst: {
266
+ summary: "Wraps agent-eval\u2019s own trace-analyst engine and applies its findings to the prompt via one identical LLM edit \u2014 the symmetric opponent to haloDriver.",
267
+ surface: "prompt",
268
+ strategy: "analysis-edit",
269
+ whenUse: "Benchmarking our trace-analyst\u2019s analysis quality against HALO (analysis-quality head-to-head), or improving from a real OTLP trace corpus.",
270
+ cost: "high"
271
+ },
272
+ evolutionary: {
273
+ summary: "Adapts a stateless Mutator (population mutate \u2192 measure \u2192 select); no generation memory beyond the current surface.",
274
+ surface: "any",
275
+ strategy: "population-mutate",
276
+ whenUse: "Blind population search when you have a Mutator and don\u2019t need reflective reasoning over findings.",
277
+ cost: "medium"
278
+ }
279
+ };
280
+ var GOAL_RANK = {
281
+ explore: ["gepa", "evolutionary"],
282
+ refine: ["skillOpt", "gepa"],
283
+ accumulate: ["ace", "memoryCuration"],
284
+ benchmark: ["traceAnalyst", "halo"]
285
+ };
286
+ function selectDriver(criteria) {
287
+ const ranked = GOAL_RANK[criteria.goal];
288
+ const out = [];
289
+ for (const name of ranked) {
290
+ const entry = DRIVER_GUIDE[name];
291
+ if (criteria.surface && criteria.surface !== "any" && entry.surface !== criteria.surface)
292
+ continue;
293
+ out.push({
294
+ name,
295
+ entry,
296
+ reason: `${criteria.goal}: ${entry.strategy} on the ${entry.surface} surface \u2014 ${entry.whenUse}`
297
+ });
298
+ }
299
+ if (out.length === 0 && criteria.surface) {
300
+ for (const name of Object.keys(DRIVER_GUIDE)) {
301
+ const entry = DRIVER_GUIDE[name];
302
+ if (entry.surface === criteria.surface || entry.surface === "any") {
303
+ out.push({ name, entry, reason: `surface match (${entry.surface}): ${entry.whenUse}` });
304
+ }
305
+ }
306
+ }
307
+ return out;
308
+ }
309
+
310
+ // src/campaign/drivers/halo.ts
311
+ import { execFile } from "child_process";
312
+ import { mkdtempSync, writeFileSync } from "fs";
313
+ import { tmpdir } from "os";
314
+ import { join } from "path";
315
+ import { promisify } from "util";
316
+ var execFileAsync = promisify(execFile);
317
+ var DEFAULT_ANALYSIS_PROMPT = "Diagnose the failures in these agent execution traces \u2014 hallucinated tool calls, redundant tool arguments, refusal loops, and semantic-correctness errors \u2014 and suggest concrete, generalizable fixes to the agent instructions.";
318
+ var APPLY_SYSTEM = "You apply a trace-analysis report to an agent instruction prompt. Output ONLY the full revised prompt \u2014 no preamble, no commentary, no code fences. Make the minimal edits that address the report findings; preserve everything else verbatim.";
319
+ function haloDriver(opts) {
320
+ const haloBin = opts.haloBin ?? "halo";
321
+ const model = opts.model ?? "gpt-5.4-mini";
322
+ return {
323
+ kind: "halo",
324
+ async propose(ctx) {
325
+ const parent = typeof ctx.currentSurface === "string" ? ctx.currentSurface : JSON.stringify(ctx.currentSurface);
326
+ const traces = await opts.resolveTraces(ctx) ?? "";
327
+ if (!traces.trim()) {
328
+ throw new Error(
329
+ "haloDriver: resolveTraces returned no OTLP traces \u2014 the halo engine has nothing to analyze"
330
+ );
331
+ }
332
+ const dir = mkdtempSync(join(tmpdir(), "halo-driver-"));
333
+ const tracePath = join(dir, "traces.jsonl");
334
+ writeFileSync(tracePath, traces.endsWith("\n") ? traces : `${traces}
335
+ `);
336
+ const args = [
337
+ tracePath,
338
+ "-p",
339
+ opts.analysisPrompt ?? DEFAULT_ANALYSIS_PROMPT,
340
+ "-m",
341
+ model,
342
+ ...opts.maxDepth !== void 0 ? ["--max-depth", String(opts.maxDepth)] : [],
343
+ ...opts.maxTurns !== void 0 ? ["--max-turns", String(opts.maxTurns)] : []
344
+ ];
345
+ let findings;
346
+ try {
347
+ const { stdout } = await execFileAsync(haloBin, args, {
348
+ maxBuffer: 64 * 1024 * 1024,
349
+ signal: ctx.signal,
350
+ env: {
351
+ ...process.env,
352
+ ...opts.apiKey ? { OPENAI_API_KEY: opts.apiKey } : {},
353
+ OPENAI_BASE_URL: opts.baseUrl
354
+ }
355
+ });
356
+ findings = stdout.trim();
357
+ } catch (e) {
358
+ throw new Error(
359
+ `haloDriver: halo-engine ('${haloBin}') failed \u2014 ${e instanceof Error ? e.message : String(e)}`
360
+ );
361
+ }
362
+ if (!findings) throw new Error("haloDriver: halo-engine produced no findings");
363
+ const applied = await callLlm(
364
+ {
365
+ model: opts.applyModel ?? model,
366
+ messages: [
367
+ { role: "system", content: APPLY_SYSTEM },
368
+ {
369
+ role: "user",
370
+ content: `CURRENT PROMPT:
371
+ ${parent}
372
+
373
+ HALO TRACE-ANALYSIS REPORT:
374
+ ${findings}
375
+
376
+ Return the full revised prompt.`
377
+ }
378
+ ]
379
+ },
380
+ { baseUrl: opts.baseUrl, apiKey: opts.apiKey, fetch: opts.fetchImpl }
381
+ );
382
+ const text = applied.content.trim();
383
+ if (!text || text === parent) return [];
384
+ return [
385
+ {
386
+ surface: text,
387
+ label: "halo",
388
+ rationale: `halo-engine findings:
389
+ ${findings.slice(0, 800)}`
390
+ }
391
+ ];
392
+ }
393
+ };
394
+ }
395
+
396
+ // src/campaign/drivers/memory.ts
397
+ var BLOCK_START2 = "<!-- BEGIN curated-memory (auto-managed by memoryCurationDriver) -->";
398
+ var BLOCK_END2 = "<!-- END curated-memory -->";
399
+ var DEFAULT_HEADING2 = "## Learned from prior runs (curated memory)";
400
+ var DISTILL_SYSTEM = 'You compress raw trace-analysis findings into crisp, generalizable agent guidance. Output ONLY a JSON array of strings, each one imperative lesson the agent should follow (e.g. "Always fetch a resource before mutating it"). No prose outside the JSON. Deduplicate; keep the most actionable and general; drop case-specific noise.';
401
+ function extractExistingLessons(text) {
402
+ const start = text.indexOf(BLOCK_START2);
403
+ const end = text.indexOf(BLOCK_END2);
404
+ if (start === -1 || end === -1 || end < start) return [];
405
+ return text.slice(start + BLOCK_START2.length, end).split("\n").map((l) => l.replace(/^\s*-\s+/, "").trim()).filter((l) => l && !l.startsWith("#"));
406
+ }
407
+ function stripBlock2(text) {
408
+ const start = text.indexOf(BLOCK_START2);
409
+ const end = text.indexOf(BLOCK_END2);
410
+ if (start === -1 || end === -1 || end < start) return text.trimEnd();
411
+ return (text.slice(0, start) + text.slice(end + BLOCK_END2.length)).trimEnd();
412
+ }
413
+ async function distillLessons(raw, distill) {
414
+ const res = await callLlm(
415
+ {
416
+ model: distill.model,
417
+ messages: [
418
+ { role: "system", content: DISTILL_SYSTEM },
419
+ { role: "user", content: `Findings:
420
+ ${raw.map((r) => `- ${r}`).join("\n")}` }
421
+ ]
422
+ },
423
+ { baseUrl: distill.baseUrl, apiKey: distill.apiKey, fetch: distill.fetchImpl }
424
+ );
425
+ try {
426
+ const parsed = JSON.parse(res.content.trim());
427
+ if (Array.isArray(parsed)) {
428
+ const lessons = parsed.filter(
429
+ (x) => typeof x === "string" && x.trim().length > 0
430
+ );
431
+ if (lessons.length > 0) return lessons;
432
+ }
433
+ } catch {
434
+ }
435
+ return raw;
436
+ }
437
+ function memoryCurationDriver(opts = {}) {
438
+ const maxEntries = opts.maxEntries ?? 12;
439
+ const heading = opts.sectionHeading ?? DEFAULT_HEADING2;
440
+ return {
441
+ kind: "memory-curation",
442
+ async propose(ctx) {
443
+ const parent = surfaceToText2(ctx.currentSurface);
444
+ const fresh = [];
445
+ for (const f of ctx.findings ?? []) {
446
+ const l = findingToLesson(f);
447
+ if (l) fresh.push(l);
448
+ }
449
+ const carried = extractExistingLessons(parent);
450
+ if (fresh.length === 0 && carried.length === 0) return [];
451
+ const distilled = opts.distill && fresh.length > 0 ? await distillLessons(fresh, opts.distill) : fresh;
452
+ const byKey = /* @__PURE__ */ new Map();
453
+ for (const l of carried) {
454
+ const k = normKey(l);
455
+ if (k) byKey.set(k, { text: l, count: 1 });
456
+ }
457
+ for (const l of distilled) {
458
+ const k = normKey(l);
459
+ if (!k) continue;
460
+ const e = byKey.get(k);
461
+ if (e) e.count += 1;
462
+ else byKey.set(k, { text: l, count: 1 });
463
+ }
464
+ const ranked = [...byKey.values()].sort((a, b) => b.count - a.count || a.text.localeCompare(b.text)).slice(0, maxEntries);
465
+ if (ranked.length === 0) return [];
466
+ const block = [BLOCK_START2, heading, ...ranked.map((e) => `- ${e.text}`), BLOCK_END2].join(
467
+ "\n"
468
+ );
469
+ const next = `${stripBlock2(parent)}
470
+
471
+ ${block}
472
+ `;
473
+ if (next === parent) return [];
474
+ return [
475
+ {
476
+ surface: next,
477
+ label: "memory-curation",
478
+ rationale: `curated ${ranked.length} lessons (from ${fresh.length} new finding(s) + ${carried.length} carried)`
479
+ }
480
+ ];
481
+ }
482
+ };
483
+ }
484
+
60
485
  // src/campaign/skill-patch.ts
61
486
  function applySkillPatch(surface, patch) {
62
487
  let lines = surface.split("\n");
@@ -126,6 +551,7 @@ function skillOptDriver(opts) {
126
551
  editBudget: args.editBudget,
127
552
  rejectedBuffer: args.rejectedBuffer,
128
553
  metaNote: args.metaNote,
554
+ findingsNote: args.findingsNote,
129
555
  count: args.count
130
556
  });
131
557
  const result = await callLlm(
@@ -158,6 +584,7 @@ function skillOptDriver(opts) {
158
584
  evidence: evidenceFromHistory(ctx, evidenceK),
159
585
  editBudget: defaultBudget,
160
586
  rejectedBuffer: [],
587
+ findingsNote: renderAnalystEvidence(ctx.findings, ctx.report) ?? void 0,
161
588
  count: ctx.populationSize,
162
589
  signal: ctx.signal
163
590
  });
@@ -218,6 +645,9 @@ function buildPatchPrompt(args) {
218
645
  ...args.rejectedBuffer.map((e) => `- ${e.label}: ${e.rationale} \u2014 ${e.reason}`)
219
646
  );
220
647
  }
648
+ if (args.findingsNote) {
649
+ lines.push("", args.findingsNote);
650
+ }
221
651
  if (args.metaNote) {
222
652
  lines.push("", `Strategy note from prior epochs: ${args.metaNote}`);
223
653
  }
@@ -290,10 +720,105 @@ function snippet(s, max = 120) {
290
720
  return t.length <= max ? t : `${t.slice(0, max)}\u2026`;
291
721
  }
292
722
 
723
+ // src/campaign/drivers/trace-analyst.ts
724
+ import { mkdtempSync as mkdtempSync2, writeFileSync as writeFileSync2 } from "fs";
725
+ import { tmpdir as tmpdir2 } from "os";
726
+ import { join as join2 } from "path";
727
+ import { ai } from "@ax-llm/ax";
728
+ var APPLY_SYSTEM2 = "You apply a trace-analysis report to an agent instruction prompt. Output ONLY the full revised prompt \u2014 no preamble, no commentary, no code fences. Make the minimal edits that address the report findings; preserve everything else verbatim.";
729
+ function renderFindings(findings) {
730
+ return findings.map((f, i) => {
731
+ const action = f.recommended_action ? `
732
+ FIX: ${f.recommended_action}` : "";
733
+ const subject = f.subject ? ` (${f.subject})` : "";
734
+ return `${i + 1}. [${f.severity}/${f.area}]${subject} ${f.claim}${action}`;
735
+ }).join("\n");
736
+ }
737
+ function traceAnalystDriver(opts) {
738
+ if (!opts.apiKey) throw new Error("traceAnalystDriver: apiKey is required");
739
+ if (!opts.model) throw new Error("traceAnalystDriver: model is required");
740
+ const kinds = opts.kinds ?? DEFAULT_TRACE_ANALYST_KINDS;
741
+ return {
742
+ kind: "trace-analyst",
743
+ async propose(ctx) {
744
+ const parent = typeof ctx.currentSurface === "string" ? ctx.currentSurface : JSON.stringify(ctx.currentSurface);
745
+ const traces = await opts.resolveTraces(ctx) ?? "";
746
+ if (!traces.trim()) {
747
+ throw new Error(
748
+ "traceAnalystDriver: resolveTraces returned no OTLP traces \u2014 the analyst has nothing to read"
749
+ );
750
+ }
751
+ const dir = mkdtempSync2(join2(tmpdir2(), "trace-analyst-driver-"));
752
+ const tracePath = join2(dir, "traces.jsonl");
753
+ writeFileSync2(tracePath, traces.endsWith("\n") ? traces : `${traces}
754
+ `);
755
+ const runAnalyze = opts.analyze ?? (async (path, c) => {
756
+ const aiService = ai({
757
+ name: opts.provider ?? "openai",
758
+ apiKey: opts.apiKey,
759
+ apiURL: opts.baseUrl,
760
+ config: { model: opts.model }
761
+ });
762
+ const registry = new AnalystRegistry();
763
+ for (const spec of kinds) {
764
+ registry.register(createTraceAnalystKind(spec, { ai: aiService, model: opts.model }));
765
+ }
766
+ const result = await registry.run(
767
+ `trace-analyst-gen-${c.generation}`,
768
+ { traceStore: new OtlpFileTraceStore({ path }) },
769
+ { signal: c.signal }
770
+ );
771
+ return result.findings;
772
+ });
773
+ let findings;
774
+ try {
775
+ findings = await runAnalyze(tracePath, ctx);
776
+ } catch (e) {
777
+ throw new Error(
778
+ `traceAnalystDriver: analyst engine failed \u2014 ${e instanceof Error ? e.message : String(e)}`
779
+ );
780
+ }
781
+ if (findings.length === 0) {
782
+ throw new Error("traceAnalystDriver: analyst engine produced no findings");
783
+ }
784
+ const report = renderFindings(findings);
785
+ const applied = await callLlm(
786
+ {
787
+ model: opts.applyModel ?? opts.model,
788
+ messages: [
789
+ { role: "system", content: APPLY_SYSTEM2 },
790
+ {
791
+ role: "user",
792
+ content: `CURRENT PROMPT:
793
+ ${parent}
794
+
795
+ TRACE-ANALYSIS REPORT:
796
+ ${report}
797
+
798
+ Return the full revised prompt.`
799
+ }
800
+ ]
801
+ },
802
+ { baseUrl: opts.baseUrl, apiKey: opts.apiKey, fetch: opts.fetchImpl }
803
+ );
804
+ const text = applied.content.trim();
805
+ if (!text || text === parent) return [];
806
+ return [
807
+ {
808
+ surface: text,
809
+ label: "trace-analyst",
810
+ rationale: `trace-analyst findings (${findings.length}):
811
+ ${report.slice(0, 800)}`
812
+ }
813
+ ];
814
+ }
815
+ };
816
+ }
817
+
293
818
  // src/campaign/labeled-store/fs-adapter.ts
294
819
  import { createHash } from "crypto";
295
- import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
296
- import { join } from "path";
820
+ import { existsSync, mkdirSync, readFileSync, writeFileSync as writeFileSync3 } from "fs";
821
+ import { join as join3 } from "path";
297
822
  var LabeledScenarioStoreError = class extends Error {
298
823
  constructor(code, message) {
299
824
  super(message);
@@ -453,7 +978,7 @@ var FsLabeledScenarioStore = class {
453
978
  };
454
979
  }
455
980
  pathForSource(source) {
456
- return join(this.options.root, `${source}.jsonl`);
981
+ return join3(this.options.root, `${source}.jsonl`);
457
982
  }
458
983
  };
459
984
  var ALL_SOURCES = [
@@ -495,9 +1020,9 @@ function sha256(input) {
495
1020
  function appendLine(path, line) {
496
1021
  if (existsSync(path)) {
497
1022
  const existing = readFileSync(path, "utf8");
498
- writeFileSync(path, existing + line);
1023
+ writeFileSync3(path, existing + line);
499
1024
  } else {
500
- writeFileSync(path, line);
1025
+ writeFileSync3(path, line);
501
1026
  }
502
1027
  }
503
1028
 
@@ -798,7 +1323,13 @@ function gepaEntry(config, combineParents, name) {
798
1323
  }),
799
1324
  autoOnPromote: "none",
800
1325
  runDir: `${config.runDir}/${slug(name)}-loop`,
801
- ...config.seed !== void 0 ? { seed: config.seed } : {}
1326
+ ...config.seed !== void 0 ? { seed: config.seed } : {},
1327
+ // EYES→HANDS: flow findings to the driver's propose(). These reach
1328
+ // runOptimization unchanged (runImprovementLoop extends RunOptimizationOptions
1329
+ // and forwards {...opts}); ctx.findings/report/analyzeGeneration are consumed there.
1330
+ ...config.findings !== void 0 ? { findings: config.findings } : {},
1331
+ ...config.analyzeGeneration ? { analyzeGeneration: config.analyzeGeneration } : {},
1332
+ ...config.report !== void 0 ? { report: config.report } : {}
802
1333
  });
803
1334
  const costUsd = result.baselineCampaign.aggregates.totalCostUsd + result.generations.reduce(
804
1335
  (sum, g) => sum + g.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0),
@@ -834,9 +1365,41 @@ function skillOptEntry(config, name = "skill-opt") {
834
1365
  };
835
1366
  }
836
1367
 
1368
+ // src/campaign/presets/playback.ts
1369
+ function makePlaybackDispatch(driver) {
1370
+ return async (profile, scenario, ctx) => {
1371
+ const events = await driver.run(scenario, { ...ctx, profile });
1372
+ return extractProducedState(events);
1373
+ };
1374
+ }
1375
+ async function scoreUserStory(story, state, checkCorrectness) {
1376
+ const verdict = await verifyCompletion(
1377
+ { taskId: story.id, requirements: story.requirements },
1378
+ state,
1379
+ checkCorrectness
1380
+ );
1381
+ return { ...verdict, title: story.title };
1382
+ }
1383
+ function userStoryScoreboard(verdicts) {
1384
+ const rows = [];
1385
+ for (const v of verdicts) {
1386
+ for (const r of v.requirements) {
1387
+ rows.push({
1388
+ storyId: v.taskId,
1389
+ storyTitle: v.title,
1390
+ reqId: r.reqId,
1391
+ reqTitle: r.title,
1392
+ status: r.satisfied ? "PASS" : "FAIL",
1393
+ evidence: r.evidence
1394
+ });
1395
+ }
1396
+ }
1397
+ return rows;
1398
+ }
1399
+
837
1400
  // src/campaign/presets/run-profile-matrix.ts
838
1401
  import { createHash as createHash2 } from "crypto";
839
- import { join as join2 } from "path";
1402
+ import { join as join4 } from "path";
840
1403
  var ProfileMatrixError = class extends AgentEvalError {
841
1404
  constructor(message) {
842
1405
  super("profile_matrix", message);
@@ -873,15 +1436,22 @@ function buildRunRecord(args) {
873
1436
  }
874
1437
  const perDimMean = {};
875
1438
  for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] = mean2(values);
876
- raw.cost_usd = cell.costUsd;
1439
+ let costUsd = cell.costUsd;
1440
+ let costEstimated = false;
1441
+ if (costUsd === 0 && cell.tokenUsage.output > 0 && isModelPriced(profile.model)) {
1442
+ costUsd = estimateCost(cell.tokenUsage.input, cell.tokenUsage.output, profile.model);
1443
+ costEstimated = costUsd > 0;
1444
+ }
1445
+ raw.cost_usd = costUsd;
1446
+ raw.cost_estimated = costEstimated ? 1 : 0;
877
1447
  raw.tokens_input = cell.tokenUsage.input;
878
1448
  raw.tokens_output = cell.tokenUsage.output;
879
1449
  if (typeof cell.tokenUsage.cached === "number") raw.tokens_cached = cell.tokenUsage.cached;
880
1450
  raw.latency_ms = cell.durationMs;
881
- if (cell.costUsd > 0) {
882
- raw.tokens_per_dollar = (cell.tokenUsage.input + cell.tokenUsage.output) / cell.costUsd;
1451
+ if (costUsd > 0) {
1452
+ raw.tokens_per_dollar = (cell.tokenUsage.input + cell.tokenUsage.output) / costUsd;
883
1453
  }
884
- if (composite > 0.01) raw.cost_per_quality = cell.costUsd / composite;
1454
+ if (composite > 0.01) raw.cost_per_quality = costUsd / composite;
885
1455
  const outcome = splitTag === "holdout" ? { holdoutScore: composite, raw } : { searchScore: composite, raw };
886
1456
  if (Object.keys(perJudge).length > 0) {
887
1457
  outcome.judgeScores = {
@@ -901,7 +1471,7 @@ function buildRunRecord(args) {
901
1471
  configHash,
902
1472
  commitSha,
903
1473
  wallMs: cell.durationMs,
904
- costUsd: cell.costUsd,
1474
+ costUsd,
905
1475
  tokenUsage: cell.tokenUsage,
906
1476
  outcome,
907
1477
  splitTag,
@@ -980,9 +1550,8 @@ async function runProfileMatrix(opts) {
980
1550
  captureSource: opts.captureSource,
981
1551
  storage: opts.storage,
982
1552
  now: opts.now,
983
- runDir: join2(opts.runDir, sanitize(profile.id))
1553
+ runDir: join4(opts.runDir, sanitize(profile.id))
984
1554
  });
985
- campaigns[profile.id] = campaign;
986
1555
  const profileRecords = [];
987
1556
  for (const cell of campaign.cells) {
988
1557
  const record = buildRunRecord({
@@ -1001,13 +1570,18 @@ async function runProfileMatrix(opts) {
1001
1570
  profileRecords.push(record);
1002
1571
  records.push(record);
1003
1572
  }
1573
+ const pricedTotalCostUsd = profileRecords.reduce((a, r) => a + r.costUsd, 0);
1574
+ campaigns[profile.id] = {
1575
+ ...campaign,
1576
+ aggregates: { ...campaign.aggregates, totalCostUsd: pricedTotalCostUsd }
1577
+ };
1004
1578
  byProfile[profile.id] = {
1005
1579
  profileId: profile.id,
1006
1580
  profileHash,
1007
1581
  model: profile.model,
1008
1582
  records: profileRecords.length,
1009
1583
  meanComposite: mean2(profileRecords.map(compositeOf)),
1010
- totalCostUsd: profileRecords.reduce((a, r) => a + r.costUsd, 0),
1584
+ totalCostUsd: pricedTotalCostUsd,
1011
1585
  integrity: summarizeBackendIntegrity(profileRecords)
1012
1586
  };
1013
1587
  }
@@ -1048,7 +1622,7 @@ function rollupByPersona(records, scenarios, personaOf) {
1048
1622
  // src/campaign/worktree/index.ts
1049
1623
  import { execFileSync } from "child_process";
1050
1624
  import { existsSync as existsSync2 } from "fs";
1051
- import { basename, isAbsolute, join as join3 } from "path";
1625
+ import { basename, isAbsolute, join as join5 } from "path";
1052
1626
  var WorktreeAdapterError = class extends Error {
1053
1627
  constructor(message, cause) {
1054
1628
  super(message);
@@ -1070,13 +1644,13 @@ function slug2(label) {
1070
1644
  }
1071
1645
  function gitWorktreeAdapter(opts) {
1072
1646
  const git = opts.git ?? defaultGit;
1073
- const worktreeDir = opts.worktreeDir ?? join3(opts.repoRoot, ".worktrees");
1647
+ const worktreeDir = opts.worktreeDir ?? join5(opts.repoRoot, ".worktrees");
1074
1648
  const branchPrefix = opts.branchPrefix ?? "improve";
1075
1649
  return {
1076
1650
  async create({ baseRef, label }) {
1077
1651
  const id = `${slug2(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`;
1078
1652
  const branch = `${branchPrefix}/${id}`;
1079
- const path = join3(worktreeDir, id);
1653
+ const path = join5(worktreeDir, id);
1080
1654
  git(["worktree", "add", "-b", branch, path, baseRef], opts.repoRoot);
1081
1655
  return { path, branch, baseRef };
1082
1656
  },
@@ -1101,16 +1675,20 @@ function gitWorktreeAdapter(opts) {
1101
1675
  }
1102
1676
/**
 * Resolve the filesystem path for a surface's worktree reference.
 *
 * - If `surface.worktreeRef` is an absolute path that exists on disk, it is
 *   returned unchanged.
 * - Otherwise, when `worktreeDir` is truthy, the result is `worktreeDir`
 *   joined with the basename of the reference.
 * - Otherwise the reference is returned as-is.
 *
 * @param {{ worktreeRef: string }} surface - Object carrying the reference.
 * @param {string} [worktreeDir] - Directory used to re-anchor the reference.
 * @returns {string} The resolved path.
 */
function resolveWorktreePath(surface, worktreeDir) {
  const ref = surface.worktreeRef;
  const usableAsIs = isAbsolute(ref) && existsSync2(ref);
  if (usableAsIs || !worktreeDir) {
    return ref;
  }
  // Only the last path segment is kept when re-anchoring under worktreeDir.
  return join5(worktreeDir, basename(ref));
}
1107
1681
  export {
1682
+ DRIVER_GUIDE,
1108
1683
  FsLabeledScenarioStore,
1109
1684
  LabeledScenarioStoreError,
1110
1685
  ProfileMatrixError,
1111
1686
  SkillPatchParseError,
1112
1687
  WorktreeAdapterError,
1688
+ aceDriver,
1113
1689
  applySkillPatch,
1690
+ buildAnalystSurfaceDispatch,
1691
+ buildEvidenceVector,
1114
1692
  buildLoopProvenanceRecord,
1115
1693
  campaignBreakdown,
1116
1694
  campaignMeanComposite,
@@ -1124,19 +1702,25 @@ export {
1124
1702
  emitLoopProvenance,
1125
1703
  evolutionaryDriver,
1126
1704
  extractH2Sections,
1705
+ failureModeRecallJudge,
1127
1706
  fsCampaignStorage,
1128
1707
  gepaDriver,
1129
1708
  gepaParetoEntry,
1130
1709
  gepaReflectionEntry,
1131
1710
  gitWorktreeAdapter,
1711
+ haloDriver,
1132
1712
  heldOutGate,
1133
1713
  heldoutSignificance,
1134
1714
  inMemoryCampaignStorage,
1135
1715
  isProposedCandidate,
1136
1716
  labelTrustRank,
1137
1717
  loopProvenanceSpans,
1718
+ makePlaybackDispatch,
1719
+ memoryCurationDriver,
1138
1720
  openAutoPr,
1139
1721
  pairHoldout,
1722
+ paretoPolicy,
1723
+ paretoSignificanceGate,
1140
1724
  parseSkillPatchResponse,
1141
1725
  patchEditCount,
1142
1726
  provenanceRecordPath,
@@ -1148,9 +1732,13 @@ export {
1148
1732
  runOptimization,
1149
1733
  runProfileMatrix,
1150
1734
  runSkillOpt,
1735
+ scoreUserStory,
1736
+ selectDriver,
1151
1737
  skillOptDriver,
1152
1738
  skillOptEntry,
1153
1739
  surfaceContentHash,
1154
- surfaceHash
1740
+ surfaceHash,
1741
+ traceAnalystDriver,
1742
+ userStoryScoreboard
1155
1743
  };
1156
1744
  //# sourceMappingURL=index.js.map