@tangle-network/agent-eval 0.72.0 → 0.72.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +39 -0
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +3 -2
- package/dist/agent-profile-DYRboYWu.d.ts +364 -0
- package/dist/analyst/index.d.ts +221 -0
- package/dist/analyst/index.js +371 -0
- package/dist/analyst/index.js.map +1 -0
- package/dist/analyst-t7zZS3TV.d.ts +88 -0
- package/dist/campaign/index.d.ts +518 -9
- package/dist/campaign/index.js +672 -22
- package/dist/campaign/index.js.map +1 -1
- package/dist/chunk-7W4SM7FD.js +1075 -0
- package/dist/chunk-7W4SM7FD.js.map +1 -0
- package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
- package/dist/chunk-JHA3ZGSO.js +1496 -0
- package/dist/chunk-JHA3ZGSO.js.map +1 -0
- package/dist/{chunk-4QJN7RDX.js → chunk-JYE3WOTE.js} +55 -7
- package/dist/{chunk-4QJN7RDX.js.map → chunk-JYE3WOTE.js.map} +1 -1
- package/dist/chunk-LB2UOI5F.js +412 -0
- package/dist/chunk-LB2UOI5F.js.map +1 -0
- package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
- package/dist/chunk-VUINJM5M.js.map +1 -0
- package/dist/chunk-WYIHD6EB.js +1044 -0
- package/dist/chunk-WYIHD6EB.js.map +1 -0
- package/dist/{chunk-UD6EF73X.js → chunk-XPILG2CA.js} +119 -2
- package/dist/chunk-XPILG2CA.js.map +1 -0
- package/dist/contract/index.d.ts +17 -13
- package/dist/contract/index.js +13 -7
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
- package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
- package/dist/hosted/index.d.ts +223 -2
- package/dist/index.d.ts +49 -1323
- package/dist/index.js +353 -2496
- package/dist/index.js.map +1 -1
- package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
- package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
- package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
- package/dist/openapi.json +1 -1
- package/dist/pareto-E-pembql.d.ts +81 -0
- package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
- package/dist/redact-B40YG2M_.d.ts +45 -0
- package/dist/registry-DuVYiTvw.d.ts +128 -0
- package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
- package/dist/rl.d.ts +4 -3
- package/dist/rl.js +4 -4
- package/dist/run-critic-BAIjX99r.d.ts +56 -0
- package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
- package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
- package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
- package/dist/traces.d.ts +371 -308
- package/dist/traces.js +43 -18
- package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
- package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
- package/dist/wire/index.d.ts +1 -1
- package/dist/workflow/index.d.ts +494 -0
- package/dist/workflow/index.js +2177 -0
- package/dist/workflow/index.js.map +1 -0
- package/docs/design/self-improvement-roadmap.md +106 -0
- package/package.json +36 -12
- package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
- package/dist/chunk-ODGETRTM.js.map +0 -1
- package/dist/chunk-SL55X4VN.js +0 -186
- package/dist/chunk-SL55X4VN.js.map +0 -1
- package/dist/chunk-UD6EF73X.js.map +0 -1
- /package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
package/dist/campaign/index.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import {
|
|
2
|
+
buildEvidenceVector,
|
|
2
3
|
composeGate,
|
|
3
4
|
defaultProductionGate,
|
|
4
5
|
detectScale,
|
|
@@ -6,13 +7,17 @@ import {
|
|
|
6
7
|
evolutionaryDriver,
|
|
7
8
|
heldoutSignificance,
|
|
8
9
|
pairHoldout,
|
|
10
|
+
paretoPolicy,
|
|
11
|
+
paretoSignificanceGate,
|
|
9
12
|
runEval
|
|
10
|
-
} from "../chunk-
|
|
13
|
+
} from "../chunk-XPILG2CA.js";
|
|
11
14
|
import {
|
|
12
15
|
agentProfileHash,
|
|
13
16
|
estimateCost,
|
|
14
|
-
|
|
15
|
-
|
|
17
|
+
extractProducedState,
|
|
18
|
+
isModelPriced,
|
|
19
|
+
verifyCompletion
|
|
20
|
+
} from "../chunk-LB2UOI5F.js";
|
|
16
21
|
import {
|
|
17
22
|
buildLoopProvenanceRecord,
|
|
18
23
|
campaignBreakdown,
|
|
@@ -29,11 +34,12 @@ import {
|
|
|
29
34
|
openAutoPr,
|
|
30
35
|
provenanceRecordPath,
|
|
31
36
|
provenanceSpansPath,
|
|
37
|
+
renderAnalystEvidence,
|
|
32
38
|
runImprovementLoop,
|
|
33
39
|
runOptimization,
|
|
34
40
|
surfaceContentHash,
|
|
35
41
|
surfaceHash
|
|
36
|
-
} from "../chunk-
|
|
42
|
+
} from "../chunk-JYE3WOTE.js";
|
|
37
43
|
import {
|
|
38
44
|
assertRealBackend,
|
|
39
45
|
fsCampaignStorage,
|
|
@@ -41,24 +47,441 @@ import {
|
|
|
41
47
|
runCampaign,
|
|
42
48
|
summarizeBackendIntegrity
|
|
43
49
|
} from "../chunk-ZPSKPT3V.js";
|
|
50
|
+
import {
|
|
51
|
+
AnalystRegistry,
|
|
52
|
+
DEFAULT_TRACE_ANALYST_KINDS,
|
|
53
|
+
createTraceAnalystKind
|
|
54
|
+
} from "../chunk-WYIHD6EB.js";
|
|
44
55
|
import "../chunk-YV7J7X5N.js";
|
|
45
56
|
import {
|
|
46
|
-
|
|
47
|
-
} from "../chunk-
|
|
57
|
+
callLlm
|
|
58
|
+
} from "../chunk-IHDHUN2X.js";
|
|
48
59
|
import {
|
|
49
60
|
pairedBootstrap
|
|
50
61
|
} from "../chunk-ITBRCT73.js";
|
|
51
62
|
import "../chunk-GGE4NNQT.js";
|
|
52
|
-
import "../chunk-VSMTAMNK.js";
|
|
53
63
|
import {
|
|
54
|
-
|
|
55
|
-
|
|
64
|
+
OtlpFileTraceStore,
|
|
65
|
+
analyzeTraces
|
|
66
|
+
} from "../chunk-VUINJM5M.js";
|
|
56
67
|
import "../chunk-PC4UYEBM.js";
|
|
68
|
+
import {
|
|
69
|
+
validateRunRecord
|
|
70
|
+
} from "../chunk-F3SRAAZO.js";
|
|
71
|
+
import "../chunk-VSMTAMNK.js";
|
|
57
72
|
import {
|
|
58
73
|
AgentEvalError
|
|
59
74
|
} from "../chunk-3BFEG2F6.js";
|
|
60
75
|
import "../chunk-PZ5AY32C.js";
|
|
61
76
|
|
|
77
|
+
// src/campaign/analyst-surface.ts
|
|
78
|
+
/**
 * Return the analyst surface as prompt text.
 *
 * @param {unknown} surface - Candidate surface; only a string is accepted.
 * @returns {string} The surface itself when it is a string.
 * @throws {Error} When the surface is not a string. Object surfaces are
 *   described by their `kind` and `worktreeRef`; anything else (null,
 *   undefined, numbers, ...) is described by its type.
 */
function surfaceToText(surface) {
  if (typeof surface === "string") return surface;
  // Guard the property reads: dereferencing null/undefined here would replace
  // the descriptive error below with an opaque TypeError.
  const described =
    surface !== null && typeof surface === "object"
      ? `a ${surface.kind}-tier surface (${surface.worktreeRef})`
      : surface === null
        ? "null"
        : typeof surface;
  throw new Error(
    `buildAnalystSurfaceDispatch: the analyst surface must be a string actorDescription, got ${described}. The analyst prompt is prompt-tier.`
  );
}
|
|
84
|
+
/**
 * Build a dispatch function that runs the trace analyst with a candidate
 * surface used as the analyst's `actorDescription`.
 *
 * @param {object} opts
 * @param {Function} [opts.analyze] - Analysis function; falls back to `analyzeTraces`.
 * @param {object} [opts.analystOptions] - Options spread into every analysis call.
 * @returns {Function} Async `(surface, scenario, ctx)` dispatch resolving to
 *   `{ answer, findings, actorPromptVersion }` picked from the analysis result.
 */
function buildAnalystSurfaceDispatch(opts) {
  const runAnalysis = opts.analyze ?? analyzeTraces;
  return async (surface, scenario, _ctx) => {
    // Validate the surface first so a non-string surface fails before any analysis runs.
    const actorDescription = surfaceToText(surface);
    const request = { question: scenario.question };
    // Per-call values are listed after the spread, so they win over analystOptions.
    const callOptions = { ...opts.analystOptions, actorDescription, source: scenario.source };
    const outcome = await runAnalysis(request, callOptions);
    return {
      answer: outcome.answer,
      findings: outcome.findings,
      actorPromptVersion: outcome.actorPromptVersion
    };
  };
}
|
|
99
|
+
/**
 * Judge that scores an analyst artifact by how many expected failure modes
 * its findings mention (recall) and, when the scenario lists forbidden cues,
 * by the share of findings that avoid them (precision).
 *
 * Matching is a case-insensitive substring test of each cue against the
 * findings. Blank cues are ignored, because an empty string is a substring of
 * everything and would make the score vacuous.
 *
 * @param {object} [opts]
 * @param {number} [opts.recallWeight=0.5] - Weight of recall in the composite;
 *   precision gets `1 - recallWeight`. Only used when forbidden cues exist.
 * @returns {object} Judge with `name`, `dimensions`, `appliesTo` and `score`.
 */
function failureModeRecallJudge(opts = {}) {
  const recallWeight = opts.recallWeight ?? 0.5;
  const isUsableCue = (cue) => typeof cue === "string" && cue.trim() !== "";
  return {
    name: "failure-mode-recall",
    dimensions: [
      { key: "recall", description: "fraction of ground-truth failure modes the analyst surfaced" },
      {
        key: "precision",
        description: "1 \u2212 share of findings that named a failure/tool/error absent from this corpus"
      }
    ],
    appliesTo: (s) => s.kind === "analyst-surface",
    /**
     * @throws {Error} When the scenario has no expected failure modes
     *   (missing or empty), since recall would be undefined.
     */
    score({ artifact, scenario }) {
      // A missing list is treated like an empty one so the caller gets the
      // descriptive refusal below rather than a TypeError.
      const modes = scenario.expectedFailureModes ?? [];
      if (modes.length === 0) {
        throw new Error(
          `failureModeRecallJudge: scenario '${scenario.id}' has no expectedFailureModes \u2014 refusing to score (a vacuous 1.0 would corrupt the comparison)`
        );
      }
      const hay = artifact.findings.join("\n").toLowerCase();
      const matched = modes.filter(
        (m) => m.cues.some((c) => isUsableCue(c) && hay.includes(c.toLowerCase()))
      );
      const recall = matched.length / modes.length;
      const forbidden = (scenario.forbiddenCues ?? []).filter(isUsableCue).map((c) => c.toLowerCase());
      let precision = 1;
      let hallucinated = 0;
      if (forbidden.length > 0) {
        // max(1, ...) avoids 0/0 when the analyst produced no findings.
        const denom = Math.max(1, artifact.findings.length);
        hallucinated = artifact.findings.filter(
          (f) => forbidden.some((c) => f.toLowerCase().includes(c))
        ).length;
        precision = 1 - hallucinated / denom;
      }
      // Without forbidden cues precision is not measured, so the composite is recall alone.
      const composite = forbidden.length > 0 ? recallWeight * recall + (1 - recallWeight) * precision : recall;
      const matchedSet = new Set(matched);
      const missed = modes.filter((m) => !matchedSet.has(m)).map((m) => m.id);
      const notes =
        `matched ${matched.length}/${modes.length} failure modes` +
        (missed.length ? `; missed [${missed.join(", ")}]` : "") +
        (hallucinated ? `; ${hallucinated} out-of-corpus finding(s)` : "");
      return { dimensions: { recall, precision }, composite, notes };
    }
  };
}
|
|
138
|
+
|
|
139
|
+
// src/campaign/drivers/_findings-text.ts
|
|
140
|
+
/**
 * Extract a one-string lesson from a finding.
 *
 * A string finding is returned trimmed. For an object finding the fields
 * `recommended_action`, `claim`, `lesson`, `text` and `message` are tried in
 * that order and the first non-blank string wins. Blank or non-string fields
 * are skipped, so they do not hide a usable later field.
 *
 * @param {unknown} f - Finding as a string or an object.
 * @returns {string|null} The trimmed lesson, or null when none is found.
 */
function findingToLesson(f) {
  if (typeof f === "string") return f.trim() || null;
  if (!f || typeof f !== "object") return null;
  const candidateFields = ["recommended_action", "claim", "lesson", "text", "message"];
  for (const field of candidateFields) {
    const value = f[field];
    if (typeof value !== "string") continue;
    const trimmed = value.trim();
    if (trimmed) return trimmed;
  }
  return null;
}
|
|
149
|
+
/**
 * Build a comparison key for a lesson: lower-cased, whitespace runs collapsed
 * to single spaces, trailing `. ; : ! ?` and whitespace removed, then trimmed.
 *
 * @param {string} s - Lesson text.
 * @returns {string} The normalised key (may be empty).
 */
function normKey(s) {
  const collapsed = s.toLowerCase().replace(/\s+/g, " ");
  const withoutTrailingPunctuation = collapsed.replace(/[.;:!?\s]+$/, "");
  return withoutTrailingPunctuation.trim();
}
|
|
152
|
+
/**
 * Return a curator-driver surface as prompt text.
 *
 * @param {unknown} surface - Candidate surface; only a string is accepted.
 * @returns {string} The surface itself when it is a string.
 * @throws {Error} When the surface is not a string. Object surfaces are
 *   described by their `kind` and `worktreeRef`; anything else is described
 *   by its type.
 */
function surfaceToText2(surface) {
  if (typeof surface === "string") return surface;
  // Guard the property reads: dereferencing null/undefined here would replace
  // the descriptive error below with an opaque TypeError.
  const described =
    surface !== null && typeof surface === "object"
      ? `a ${surface.kind}-tier surface (${surface.worktreeRef})`
      : surface === null
        ? "null"
        : typeof surface;
  throw new Error(
    `curator driver: surface must be a string prompt, got ${described} \u2014 curation is prompt-tier`
  );
}
|
|
158
|
+
|
|
159
|
+
// src/campaign/drivers/ace.ts
|
|
160
|
+
// Markers delimiting the auto-managed playbook block inside a prompt surface.
var BLOCK_START = "<!-- BEGIN ace-playbook (auto-managed by aceDriver) -->";
var BLOCK_END = "<!-- END ace-playbook -->";
// Heading used inside the block when the caller does not supply one.
var DEFAULT_HEADING = "## Playbook (accumulated lessons \u2014 append-only)";

/**
 * Locate the playbook block in a surface.
 *
 * The END marker is searched for only after the START marker, so a stray END
 * marker earlier in the prompt does not hide a well-formed block.
 *
 * @param {string} surface
 * @returns {{start: number, bodyStart: number, end: number}|null} Offsets of
 *   the START marker, the first character after it, and the END marker; null
 *   when either marker is missing.
 */
function findAcePlaybookBlock(surface) {
  const start = surface.indexOf(BLOCK_START);
  if (start === -1) return null;
  const bodyStart = start + BLOCK_START.length;
  const end = surface.indexOf(BLOCK_END, bodyStart);
  if (end === -1) return null;
  return { start, bodyStart, end };
}

/**
 * Parse the bullets of the playbook block.
 *
 * Only lines starting with "- " are bullets. A bullet tagged `[g<N>]` yields
 * `gen: N`; an untagged bullet yields `gen: -1`.
 *
 * @param {string} surface
 * @returns {Array<{gen: number, text: string}>} Bullets in document order;
 *   empty when there is no block.
 */
function parsePlaybook(surface) {
  const span = findAcePlaybookBlock(surface);
  if (!span) return [];
  const out = [];
  for (const raw of surface.slice(span.bodyStart, span.end).split("\n")) {
    const line = raw.trim();
    if (!line.startsWith("- ")) continue;
    const item = line.slice(2).trim();
    const tag = /^\[g(-?\d+)\]\s*(.*)$/.exec(item);
    if (tag) out.push({ gen: Number(tag[1]), text: tag[2].trim() });
    else out.push({ gen: -1, text: item });
  }
  return out;
}

/**
 * Remove the playbook block (markers included) from a surface.
 *
 * @param {string} surface
 * @returns {string} The surface without the block and without trailing
 *   whitespace; when there is no block, the surface with trailing whitespace
 *   removed.
 */
function stripBlock(surface) {
  const span = findAcePlaybookBlock(surface);
  if (!span) return surface.trimEnd();
  return (surface.slice(0, span.start) + surface.slice(span.end + BLOCK_END.length)).trimEnd();
}
|
|
185
|
+
/**
 * Driver that grows a playbook block on a prompt surface: each new,
 * not-yet-seen lesson derived from `ctx.findings` is appended as a bullet
 * tagged with the current generation.
 *
 * @param {object} [opts]
 * @param {number} [opts.maxEntries=50] - Maximum bullets kept; when exceeded,
 *   the oldest bullets are dropped.
 * @param {string} [opts.sectionHeading] - Heading line written inside the block.
 * @returns {{kind: string, propose: Function}} Driver whose `propose(ctx)`
 *   resolves to at most one candidate `{ surface, label, rationale }`, or to
 *   an empty array when the findings add nothing new.
 * @throws {Error} When `maxEntries` is not a number >= 1.
 */
function aceDriver(opts = {}) {
  const maxEntries = opts.maxEntries ?? 50;
  // Written as a negated >= so NaN is rejected as well.
  if (!(maxEntries >= 1)) throw new Error("aceDriver: maxEntries must be >= 1");
  const heading = opts.sectionHeading ?? DEFAULT_HEADING;
  return {
    kind: "ace",
    async propose(ctx) {
      const parent = surfaceToText2(ctx.currentSurface);
      const existing = parsePlaybook(parent);
      // Dedup key set, seeded with what the playbook already contains.
      const seen = new Set(existing.map((b) => normKey(b.text)));
      const fresh = [];
      for (const f of ctx.findings ?? []) {
        const lesson = findingToLesson(f);
        if (!lesson) continue;
        const k = normKey(lesson);
        if (!k || seen.has(k)) continue;
        seen.add(k);
        // The block is parsed line by line, so a bullet must stay on a single
        // line; embedded line breaks are folded into spaces.
        fresh.push({ gen: ctx.generation, text: lesson.replace(/\s*[\r\n]+\s*/g, " ") });
      }
      if (fresh.length === 0) return [];
      const all = [...existing, ...fresh].slice(-maxEntries);
      const block = [
        BLOCK_START,
        heading,
        ...all.map((b) => `- [g${b.gen}] ${b.text}`),
        BLOCK_END
      ].join("\n");
      const base = stripBlock(parent);
      const surface = base ? `${base}\n\n${block}` : block;
      return [
        {
          surface,
          label: `ace-playbook +${fresh.length}`,
          rationale: `appended ${fresh.length} new lesson(s) from gen ${ctx.generation} findings (playbook now ${all.length} bullet(s), append-only)`
        }
      ];
    }
  };
}
|
|
226
|
+
|
|
227
|
+
// src/campaign/drivers/guide.ts
|
|
228
|
+
// Catalogue of the available drivers. Each entry records a summary, the
// surface it targets, its edit strategy, when to use it, and a coarse cost.
// Key order matters: selectDriver's surface fallback iterates in this order.
var DRIVER_GUIDE = {
  // Reflective rewrite of the whole surface.
  gepa: {
    summary: "Reflective full-surface rewrite: reflects on the best parent\u2019s weakest dimensions + per-scenario scores, proposes targeted rewrites, maintains a Pareto frontier across generations.",
    surface: "prompt",
    strategy: "reflective-rewrite",
    whenUse: "The default for a prompt/instruction surface with headroom \u2014 broad rewrites plus Pareto-optimal exploration across scenarios.",
    cost: "medium"
  },
  // Bounded patches to a single skill document.
  skillOpt: {
    summary: "Patch-mode: bounded, anchored add/delete/replace edits to ONE skill document, so a good rule introduced earlier is not clobbered by a later sweeping rewrite.",
    surface: "skill-doc",
    strategy: "anchored-patch",
    whenUse: 'Refining a skill document incrementally where accumulated rules must be preserved; the edit budget is the "textual learning rate".',
    cost: "medium"
  },
  // Append-mostly playbook.
  ace: {
    summary: "Append-mostly playbook curator: grows the playbook with provenance-tagged delta bullets, never merging \u2014 guards against context collapse.",
    surface: "playbook",
    strategy: "append-only",
    whenUse: "Accumulating many specific, hard-won lessons over time where dedup/rewrite would summarize away detail.",
    cost: "low"
  },
  // Deduplicated, ranked memory block.
  memoryCuration: {
    summary: "Dedup-and-rank curator: builds a compact searchable memory and grafts the most relevant, most-recurrent lessons onto the surface.",
    surface: "memory",
    strategy: "dedup-curate",
    whenUse: "Accumulating lessons while keeping the surface compact \u2014 the complement to ace when context size matters more than verbatim provenance.",
    cost: "low"
  },
  // External HALO engine; the only entry flagged `external`.
  halo: {
    summary: "Wraps the real external HALO engine (Inference.net, `halo` CLI) and applies its findings to the prompt via one LLM edit.",
    surface: "prompt",
    strategy: "analysis-edit",
    whenUse: "Benchmarking: compete HALO head-to-head against our own analysis on identical traces via compareDrivers.",
    cost: "high",
    external: true
  },
  // In-house trace analyst, same strategy as halo.
  traceAnalyst: {
    summary: "Wraps agent-eval\u2019s own trace-analyst engine and applies its findings to the prompt via one identical LLM edit \u2014 the symmetric opponent to haloDriver.",
    surface: "prompt",
    strategy: "analysis-edit",
    whenUse: "Benchmarking our trace-analyst\u2019s analysis quality against HALO (analysis-quality head-to-head), or improving from a real OTLP trace corpus.",
    cost: "high"
  },
  // Population search through a Mutator; applies to any surface.
  evolutionary: {
    summary: "Adapts a stateless Mutator (population mutate \u2192 measure \u2192 select); no generation memory beyond the current surface.",
    surface: "any",
    strategy: "population-mutate",
    whenUse: "Blind population search when you have a Mutator and don\u2019t need reflective reasoning over findings.",
    cost: "medium"
  }
};
|
|
280
|
+
// Preferred driver names per optimisation goal, best first.
var GOAL_RANK = {
  explore: ["gepa", "evolutionary"],
  refine: ["skillOpt", "gepa"],
  accumulate: ["ace", "memoryCuration"],
  benchmark: ["traceAnalyst", "halo"]
};

/**
 * Recommend drivers for a goal, optionally restricted to one surface.
 *
 * The goal's ranked drivers are filtered by `criteria.surface` (no filtering
 * when the surface is omitted or "any"). If nothing survives and a surface was
 * given, every driver whose surface matches it, or is "any", is returned.
 *
 * @param {object} criteria
 * @param {string} criteria.goal - A key of GOAL_RANK. An unknown goal has no
 *   ranked drivers, so only the surface fallback can produce results.
 * @param {string} [criteria.surface] - Surface to match.
 * @returns {Array<{name: string, entry: object, reason: string}>}
 */
function selectDriver(criteria) {
  // Own-property check: an unknown goal (or an inherited key such as
  // "constructor") must yield an empty ranking, not a non-iterable value.
  const ranked = Object.prototype.hasOwnProperty.call(GOAL_RANK, criteria.goal)
    ? GOAL_RANK[criteria.goal]
    : [];
  const out = [];
  for (const name of ranked) {
    const entry = DRIVER_GUIDE[name];
    if (criteria.surface && criteria.surface !== "any" && entry.surface !== criteria.surface)
      continue;
    out.push({
      name,
      entry,
      reason: `${criteria.goal}: ${entry.strategy} on the ${entry.surface} surface \u2014 ${entry.whenUse}`
    });
  }
  if (out.length === 0 && criteria.surface) {
    for (const name of Object.keys(DRIVER_GUIDE)) {
      const entry = DRIVER_GUIDE[name];
      if (entry.surface === criteria.surface || entry.surface === "any") {
        out.push({ name, entry, reason: `surface match (${entry.surface}): ${entry.whenUse}` });
      }
    }
  }
  return out;
}
|
|
309
|
+
|
|
310
|
+
// src/campaign/drivers/halo.ts
|
|
311
|
+
import { execFile } from "child_process";
|
|
312
|
+
import { mkdtempSync, writeFileSync } from "fs";
|
|
313
|
+
import { tmpdir } from "os";
|
|
314
|
+
import { join } from "path";
|
|
315
|
+
import { promisify } from "util";
|
|
316
|
+
// Promise-returning wrapper around child_process.execFile.
var execFileAsync = promisify(execFile);
// Analysis question passed to the halo CLI when the caller supplies none.
var DEFAULT_ANALYSIS_PROMPT = "Diagnose the failures in these agent execution traces \u2014 hallucinated tool calls, redundant tool arguments, refusal loops, and semantic-correctness errors \u2014 and suggest concrete, generalizable fixes to the agent instructions.";
// System prompt for the LLM call that applies the report to the prompt.
var APPLY_SYSTEM = "You apply a trace-analysis report to an agent instruction prompt. Output ONLY the full revised prompt \u2014 no preamble, no commentary, no code fences. Make the minimal edits that address the report findings; preserve everything else verbatim.";

/**
 * Driver that runs the external `halo` CLI over OTLP traces and applies its
 * report to the current prompt with one LLM call.
 *
 * @param {object} opts
 * @param {Function} opts.resolveTraces - `(ctx) => string`; the trace text (JSONL) to analyse.
 * @param {string} [opts.haloBin="halo"] - Executable to run.
 * @param {string} [opts.model="gpt-5.4-mini"] - Model passed to halo via `-m`.
 * @param {string} [opts.applyModel] - Model for the apply step; defaults to `model`.
 * @param {string} [opts.analysisPrompt] - Overrides DEFAULT_ANALYSIS_PROMPT.
 * @param {number} [opts.maxDepth] - Forwarded as `--max-depth`.
 * @param {number} [opts.maxTurns] - Forwarded as `--max-turns`.
 * @param {string} [opts.apiKey] - Exported to halo as OPENAI_API_KEY and passed to the apply call.
 * @param {string} [opts.baseUrl] - Exported to halo as OPENAI_BASE_URL and passed to the apply call.
 * @param {Function} [opts.fetchImpl] - fetch implementation for the apply call.
 * @returns {{kind: string, propose: Function}} Driver whose `propose(ctx)`
 *   resolves to one candidate, or to an empty array when the apply step
 *   returns nothing or the unchanged prompt.
 */
function haloDriver(opts) {
  const haloBin = opts.haloBin ?? "halo";
  const model = opts.model ?? "gpt-5.4-mini";
  return {
    kind: "halo",
    /**
     * @throws {Error} When no traces are resolved, the halo process fails
     *   (original error attached as `cause`), or halo prints nothing.
     */
    async propose(ctx) {
      const parent = typeof ctx.currentSurface === "string" ? ctx.currentSurface : JSON.stringify(ctx.currentSurface);
      const traces = await opts.resolveTraces(ctx) ?? "";
      if (!traces.trim()) {
        throw new Error(
          "haloDriver: resolveTraces returned no OTLP traces \u2014 the halo engine has nothing to analyze"
        );
      }
      const dir = mkdtempSync(join(tmpdir(), "halo-driver-"));
      let findings;
      try {
        const tracePath = join(dir, "traces.jsonl");
        // Make sure the file ends with a newline.
        writeFileSync(tracePath, traces.endsWith("\n") ? traces : `${traces}\n`);
        const args = [
          tracePath,
          "-p",
          opts.analysisPrompt ?? DEFAULT_ANALYSIS_PROMPT,
          "-m",
          model,
          ...opts.maxDepth !== void 0 ? ["--max-depth", String(opts.maxDepth)] : [],
          ...opts.maxTurns !== void 0 ? ["--max-turns", String(opts.maxTurns)] : []
        ];
        try {
          const { stdout } = await execFileAsync(haloBin, args, {
            maxBuffer: 64 * 1024 * 1024,
            signal: ctx.signal,
            env: {
              ...process.env,
              ...opts.apiKey ? { OPENAI_API_KEY: opts.apiKey } : {},
              // NOTE(review): unlike apiKey this is set unconditionally, so an
              // undefined baseUrl replaces any inherited OPENAI_BASE_URL.
              // Confirm whether that is intended.
              OPENAI_BASE_URL: opts.baseUrl
            }
          });
          findings = stdout.trim();
        } catch (e) {
          throw new Error(
            `haloDriver: halo-engine ('${haloBin}') failed \u2014 ${e instanceof Error ? e.message : String(e)}`,
            { cause: e }
          );
        }
      } finally {
        // The trace copy is only needed while halo runs; remove it so trace
        // data does not pile up in the temp directory. Cleanup is best-effort:
        // a failure here must not mask the analysis result or its error.
        const { rm } = await import("fs/promises");
        await rm(dir, { recursive: true, force: true }).catch(() => {});
      }
      if (!findings) throw new Error("haloDriver: halo-engine produced no findings");
      const applied = await callLlm(
        {
          model: opts.applyModel ?? model,
          messages: [
            { role: "system", content: APPLY_SYSTEM },
            {
              role: "user",
              content: `CURRENT PROMPT:\n${parent}\n\nHALO TRACE-ANALYSIS REPORT:\n${findings}\n\nReturn the full revised prompt.`
            }
          ]
        },
        { baseUrl: opts.baseUrl, apiKey: opts.apiKey, fetch: opts.fetchImpl }
      );
      const text = applied.content.trim();
      if (!text || text === parent) return [];
      return [
        {
          surface: text,
          label: "halo",
          // Only the first 800 characters of the report go into the rationale.
          rationale: `halo-engine findings:\n${findings.slice(0, 800)}`
        }
      ];
    }
  };
}
|
|
395
|
+
|
|
396
|
+
// src/campaign/drivers/memory.ts
|
|
397
|
+
// Markers delimiting the auto-managed curated-memory block inside a prompt surface.
var BLOCK_START2 = "<!-- BEGIN curated-memory (auto-managed by memoryCurationDriver) -->";
var BLOCK_END2 = "<!-- END curated-memory -->";
// Heading used inside the block when the caller does not supply one.
var DEFAULT_HEADING2 = "## Learned from prior runs (curated memory)";
// System prompt for the optional LLM distillation step.
var DISTILL_SYSTEM = 'You compress raw trace-analysis findings into crisp, generalizable agent guidance. Output ONLY a JSON array of strings, each one imperative lesson the agent should follow (e.g. "Always fetch a resource before mutating it"). No prose outside the JSON. Deduplicate; keep the most actionable and general; drop case-specific noise.';

/**
 * Locate the curated-memory block in a text.
 *
 * The END marker is searched for only after the START marker, so a stray END
 * marker earlier in the prompt does not hide a well-formed block.
 *
 * @param {string} text
 * @returns {{start: number, bodyStart: number, end: number}|null} Offsets of
 *   the START marker, the first character after it, and the END marker; null
 *   when either marker is missing.
 */
function findCuratedMemoryBlock(text) {
  const start = text.indexOf(BLOCK_START2);
  if (start === -1) return null;
  const bodyStart = start + BLOCK_START2.length;
  const end = text.indexOf(BLOCK_END2, bodyStart);
  if (end === -1) return null;
  return { start, bodyStart, end };
}

/**
 * Read the lessons currently stored in the curated-memory block.
 *
 * Each line has its leading "- " bullet removed and is trimmed; empty lines
 * and lines starting with "#" are dropped.
 * NOTE(review): a custom section heading that does not start with "#" is not
 * filtered here and would be read back as a lesson; confirm against callers.
 *
 * @param {string} text
 * @returns {string[]} Lessons in document order; empty when there is no block.
 */
function extractExistingLessons(text) {
  const span = findCuratedMemoryBlock(text);
  if (!span) return [];
  return text
    .slice(span.bodyStart, span.end)
    .split("\n")
    .map((l) => l.replace(/^\s*-\s+/, "").trim())
    .filter((l) => l && !l.startsWith("#"));
}

/**
 * Remove the curated-memory block (markers included) from a text.
 *
 * @param {string} text
 * @returns {string} The text without the block and without trailing
 *   whitespace; when there is no block, the text with trailing whitespace
 *   removed.
 */
function stripBlock2(text) {
  const span = findCuratedMemoryBlock(text);
  if (!span) return text.trimEnd();
  return (text.slice(0, span.start) + text.slice(span.end + BLOCK_END2.length)).trimEnd();
}
|
|
413
|
+
/**
 * Ask an LLM to compress raw findings into a JSON array of lessons.
 *
 * @param {string[]} raw - Raw lesson strings.
 * @param {object} distill - LLM settings: `model`, `baseUrl`, `apiKey`, `fetchImpl`.
 * @returns {Promise<string[]>} The non-blank strings of the model's JSON
 *   array. Falls back to `raw` itself when the reply is not parseable JSON,
 *   is not an array, or contains no non-blank strings.
 */
async function distillLessons(raw, distill) {
  const res = await callLlm(
    {
      model: distill.model,
      messages: [
        { role: "system", content: DISTILL_SYSTEM },
        { role: "user", content: `Findings:\n${raw.map((r) => `- ${r}`).join("\n")}` }
      ]
    },
    { baseUrl: distill.baseUrl, apiKey: distill.apiKey, fetch: distill.fetchImpl }
  );
  const reply = res.content.trim();
  // Unwrap a Markdown code fence (```json ... ```) around the array;
  // otherwise a valid answer would be discarded by JSON.parse.
  const fenced = /^```[a-zA-Z]*\s*([\s\S]*?)\s*```$/.exec(reply);
  const payload = fenced ? fenced[1] : reply;
  try {
    const parsed = JSON.parse(payload);
    if (Array.isArray(parsed)) {
      const lessons = parsed.filter(
        (x) => typeof x === "string" && x.trim().length > 0
      );
      if (lessons.length > 0) return lessons;
    }
  } catch {
    // Deliberate best-effort: an unparseable reply is not an error, the
    // undistilled findings are used instead.
  }
  return raw;
}
|
|
437
|
+
/**
 * Driver that maintains a deduplicated, ranked "curated memory" block on a
 * prompt surface. Lessons already in the block are merged with lessons from
 * `ctx.findings`; repeated lessons rank higher and the top `maxEntries` are
 * written back.
 *
 * @param {object} [opts]
 * @param {number} [opts.maxEntries=12] - Maximum lessons written to the block.
 * @param {string} [opts.sectionHeading] - Heading line written inside the block.
 * @param {object} [opts.distill] - When set, new findings are first compressed
 *   through `distillLessons` with these LLM settings.
 * @returns {{kind: string, propose: Function}} Driver whose `propose(ctx)`
 *   resolves to at most one candidate `{ surface, label, rationale }`, or to
 *   an empty array when there is nothing to write or the surface would not change.
 */
function memoryCurationDriver(opts = {}) {
  const maxEntries = opts.maxEntries ?? 12;
  const heading = opts.sectionHeading ?? DEFAULT_HEADING2;
  // The block is read back line by line, so each lesson must stay on one
  // line; embedded line breaks are folded into spaces.
  const toSingleLine = (s) => s.replace(/\s*[\r\n]+\s*/g, " ").trim();
  return {
    kind: "memory-curation",
    async propose(ctx) {
      const parent = surfaceToText2(ctx.currentSurface);
      const fresh = [];
      for (const f of ctx.findings ?? []) {
        const l = findingToLesson(f);
        if (l) fresh.push(l);
      }
      const carried = extractExistingLessons(parent);
      if (fresh.length === 0 && carried.length === 0) return [];
      const distilled = opts.distill && fresh.length > 0 ? await distillLessons(fresh, opts.distill) : fresh;
      // Normalised key -> { text, count }. Carried lessons start at 1; each
      // matching new lesson bumps the count.
      const byKey = /* @__PURE__ */ new Map();
      for (const l of carried) {
        const k = normKey(l);
        if (k) byKey.set(k, { text: toSingleLine(l), count: 1 });
      }
      for (const l of distilled) {
        const k = normKey(l);
        if (!k) continue;
        const e = byKey.get(k);
        if (e) e.count += 1;
        else byKey.set(k, { text: toSingleLine(l), count: 1 });
      }
      // Most recurrent first; ties are ordered by text.
      const ranked = [...byKey.values()].sort((a, b) => b.count - a.count || a.text.localeCompare(b.text)).slice(0, maxEntries);
      if (ranked.length === 0) return [];
      const block = [BLOCK_START2, heading, ...ranked.map((e) => `- ${e.text}`), BLOCK_END2].join(
        "\n"
      );
      const base = stripBlock2(parent);
      // With nothing left besides the block, emit the block alone rather than
      // starting the surface with blank lines.
      const next = base ? `${base}\n\n${block}\n` : `${block}\n`;
      if (next === parent) return [];
      return [
        {
          surface: next,
          label: "memory-curation",
          rationale: `curated ${ranked.length} lessons (from ${fresh.length} new finding(s) + ${carried.length} carried)`
        }
      ];
    }
  };
}
|
|
484
|
+
|
|
62
485
|
// src/campaign/skill-patch.ts
|
|
63
486
|
function applySkillPatch(surface, patch) {
|
|
64
487
|
let lines = surface.split("\n");
|
|
@@ -128,6 +551,7 @@ function skillOptDriver(opts) {
|
|
|
128
551
|
editBudget: args.editBudget,
|
|
129
552
|
rejectedBuffer: args.rejectedBuffer,
|
|
130
553
|
metaNote: args.metaNote,
|
|
554
|
+
findingsNote: args.findingsNote,
|
|
131
555
|
count: args.count
|
|
132
556
|
});
|
|
133
557
|
const result = await callLlm(
|
|
@@ -160,6 +584,7 @@ function skillOptDriver(opts) {
|
|
|
160
584
|
evidence: evidenceFromHistory(ctx, evidenceK),
|
|
161
585
|
editBudget: defaultBudget,
|
|
162
586
|
rejectedBuffer: [],
|
|
587
|
+
findingsNote: renderAnalystEvidence(ctx.findings, ctx.report) ?? void 0,
|
|
163
588
|
count: ctx.populationSize,
|
|
164
589
|
signal: ctx.signal
|
|
165
590
|
});
|
|
@@ -220,6 +645,9 @@ function buildPatchPrompt(args) {
|
|
|
220
645
|
...args.rejectedBuffer.map((e) => `- ${e.label}: ${e.rationale} \u2014 ${e.reason}`)
|
|
221
646
|
);
|
|
222
647
|
}
|
|
648
|
+
if (args.findingsNote) {
|
|
649
|
+
lines.push("", args.findingsNote);
|
|
650
|
+
}
|
|
223
651
|
if (args.metaNote) {
|
|
224
652
|
lines.push("", `Strategy note from prior epochs: ${args.metaNote}`);
|
|
225
653
|
}
|
|
@@ -292,10 +720,105 @@ function snippet(s, max = 120) {
|
|
|
292
720
|
return t.length <= max ? t : `${t.slice(0, max)}\u2026`;
|
|
293
721
|
}
|
|
294
722
|
|
|
723
|
+
// src/campaign/drivers/trace-analyst.ts
|
|
724
|
+
import { mkdtempSync as mkdtempSync2, writeFileSync as writeFileSync2 } from "fs";
|
|
725
|
+
import { tmpdir as tmpdir2 } from "os";
|
|
726
|
+
import { join as join2 } from "path";
|
|
727
|
+
import { ai } from "@ax-llm/ax";
|
|
728
|
+
// System prompt for the LLM call that applies the analyst report to the prompt.
var APPLY_SYSTEM2 = "You apply a trace-analysis report to an agent instruction prompt. Output ONLY the full revised prompt \u2014 no preamble, no commentary, no code fences. Make the minimal edits that address the report findings; preserve everything else verbatim.";

/**
 * Render findings as a numbered plain-text report, one entry per finding:
 * `N. [severity/area] (subject) claim`, followed by a `FIX:` line when the
 * finding has a `recommended_action`. The subject part is omitted when absent.
 *
 * @param {Array<object>} findings
 * @returns {string} Entries joined with newlines.
 */
function renderFindings(findings) {
  const entries = [];
  findings.forEach((finding, index) => {
    let entry = `${index + 1}. [${finding.severity}/${finding.area}]`;
    if (finding.subject) entry += ` (${finding.subject})`;
    entry += ` ${finding.claim}`;
    if (finding.recommended_action) entry += `\nFIX: ${finding.recommended_action}`;
    entries.push(entry);
  });
  return entries.join("\n");
}
|
|
737
|
+
/**
 * Driver that runs the built-in trace analyst over OTLP traces and applies
 * its findings to the current prompt with one LLM call.
 *
 * @param {object} opts
 * @param {string} opts.apiKey - Required; used by the analyst and the apply call.
 * @param {string} opts.model - Required; analyst model and default apply model.
 * @param {Function} opts.resolveTraces - `(ctx) => string`; the trace text (JSONL) to analyse.
 * @param {Array} [opts.kinds] - Analyst kind specs; defaults to DEFAULT_TRACE_ANALYST_KINDS.
 * @param {Function} [opts.analyze] - `(tracePath, ctx) => findings[]`; replaces the built-in analyst run.
 * @param {string} [opts.provider="openai"] - Provider name for the built-in analyst.
 * @param {string} [opts.baseUrl] - API base URL for the analyst and the apply call.
 * @param {string} [opts.applyModel] - Model for the apply step; defaults to `model`.
 * @param {Function} [opts.fetchImpl] - fetch implementation for the apply call.
 * @returns {{kind: string, propose: Function}} Driver whose `propose(ctx)`
 *   resolves to one candidate, or to an empty array when the apply step
 *   returns nothing or the unchanged prompt.
 * @throws {Error} When `apiKey` or `model` is missing.
 */
function traceAnalystDriver(opts) {
  if (!opts.apiKey) throw new Error("traceAnalystDriver: apiKey is required");
  if (!opts.model) throw new Error("traceAnalystDriver: model is required");
  const kinds = opts.kinds ?? DEFAULT_TRACE_ANALYST_KINDS;
  return {
    kind: "trace-analyst",
    /**
     * @throws {Error} When no traces are resolved, the analyst run fails
     *   (original error attached as `cause`), or it yields no findings.
     */
    async propose(ctx) {
      const parent = typeof ctx.currentSurface === "string" ? ctx.currentSurface : JSON.stringify(ctx.currentSurface);
      const traces = await opts.resolveTraces(ctx) ?? "";
      if (!traces.trim()) {
        throw new Error(
          "traceAnalystDriver: resolveTraces returned no OTLP traces \u2014 the analyst has nothing to read"
        );
      }
      // Built-in analysis: register every kind and run them over the trace file.
      const runAnalyze = opts.analyze ?? (async (path, c) => {
        const aiService = ai({
          name: opts.provider ?? "openai",
          apiKey: opts.apiKey,
          apiURL: opts.baseUrl,
          config: { model: opts.model }
        });
        const registry = new AnalystRegistry();
        for (const spec of kinds) {
          registry.register(createTraceAnalystKind(spec, { ai: aiService, model: opts.model }));
        }
        const result = await registry.run(
          `trace-analyst-gen-${c.generation}`,
          { traceStore: new OtlpFileTraceStore({ path }) },
          { signal: c.signal }
        );
        return result.findings;
      });
      const dir = mkdtempSync2(join2(tmpdir2(), "trace-analyst-driver-"));
      let findings;
      try {
        const tracePath = join2(dir, "traces.jsonl");
        // Make sure the file ends with a newline.
        writeFileSync2(tracePath, traces.endsWith("\n") ? traces : `${traces}\n`);
        try {
          findings = await runAnalyze(tracePath, ctx);
        } catch (e) {
          throw new Error(
            `traceAnalystDriver: analyst engine failed \u2014 ${e instanceof Error ? e.message : String(e)}`,
            { cause: e }
          );
        }
      } finally {
        // The trace copy is only needed during the analysis; remove it so
        // trace data does not pile up in the temp directory. Cleanup is
        // best-effort and must not mask the analysis result or its error.
        const { rm } = await import("fs/promises");
        await rm(dir, { recursive: true, force: true }).catch(() => {});
      }
      // A custom analyze may return nothing at all; treat that as "no findings".
      if (!Array.isArray(findings) || findings.length === 0) {
        throw new Error("traceAnalystDriver: analyst engine produced no findings");
      }
      const report = renderFindings(findings);
      const applied = await callLlm(
        {
          model: opts.applyModel ?? opts.model,
          messages: [
            { role: "system", content: APPLY_SYSTEM2 },
            {
              role: "user",
              content: `CURRENT PROMPT:\n${parent}\n\nTRACE-ANALYSIS REPORT:\n${report}\n\nReturn the full revised prompt.`
            }
          ]
        },
        { baseUrl: opts.baseUrl, apiKey: opts.apiKey, fetch: opts.fetchImpl }
      );
      const text = applied.content.trim();
      if (!text || text === parent) return [];
      return [
        {
          surface: text,
          label: "trace-analyst",
          // Only the first 800 characters of the report go into the rationale.
          rationale: `trace-analyst findings (${findings.length}):\n${report.slice(0, 800)}`
        }
      ];
    }
  };
}
|
|
817
|
+
|
|
295
818
|
// src/campaign/labeled-store/fs-adapter.ts
|
|
296
819
|
import { createHash } from "crypto";
|
|
297
|
-
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
298
|
-
import { join } from "path";
|
|
820
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync as writeFileSync3 } from "fs";
|
|
821
|
+
import { join as join3 } from "path";
|
|
299
822
|
var LabeledScenarioStoreError = class extends Error {
|
|
300
823
|
constructor(code, message) {
|
|
301
824
|
super(message);
|
|
@@ -455,7 +978,7 @@ var FsLabeledScenarioStore = class {
|
|
|
455
978
|
};
|
|
456
979
|
}
|
|
457
980
|
pathForSource(source) {
|
|
458
|
-
return
|
|
981
|
+
return join3(this.options.root, `${source}.jsonl`);
|
|
459
982
|
}
|
|
460
983
|
};
|
|
461
984
|
var ALL_SOURCES = [
|
|
@@ -497,9 +1020,9 @@ function sha256(input) {
|
|
|
497
1020
|
/**
 * Append `line` to the file at `path`, creating the file if it does not exist.
 *
 * The text is written verbatim: no newline is added, so callers pass the
 * terminator themselves.
 *
 * This uses an append-mode write rather than reading the whole file back and
 * rewriting it. The read-then-rewrite approach costs O(file size) per appended
 * line and can lose existing content if the process dies between the read and
 * the rewrite.
 *
 * @param {string} path - Destination file.
 * @param {string} line - Text to append.
 */
function appendLine(path, line) {
  appendLabeledLineSync(path, line);
}
|
|
505
1028
|
|
|
@@ -800,7 +1323,13 @@ function gepaEntry(config, combineParents, name) {
|
|
|
800
1323
|
}),
|
|
801
1324
|
autoOnPromote: "none",
|
|
802
1325
|
runDir: `${config.runDir}/${slug(name)}-loop`,
|
|
803
|
-
...config.seed !== void 0 ? { seed: config.seed } : {}
|
|
1326
|
+
...config.seed !== void 0 ? { seed: config.seed } : {},
|
|
1327
|
+
// EYES→HANDS: flow findings to the driver's propose(). These reach
|
|
1328
|
+
// runOptimization unchanged (runImprovementLoop extends RunOptimizationOptions
|
|
1329
|
+
// and forwards {...opts}); ctx.findings/report/analyzeGeneration are consumed there.
|
|
1330
|
+
...config.findings !== void 0 ? { findings: config.findings } : {},
|
|
1331
|
+
...config.analyzeGeneration ? { analyzeGeneration: config.analyzeGeneration } : {},
|
|
1332
|
+
...config.report !== void 0 ? { report: config.report } : {}
|
|
804
1333
|
});
|
|
805
1334
|
const costUsd = result.baselineCampaign.aggregates.totalCostUsd + result.generations.reduce(
|
|
806
1335
|
(sum, g) => sum + g.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0),
|
|
@@ -836,9 +1365,114 @@ function skillOptEntry(config, name = "skill-opt") {
|
|
|
836
1365
|
};
|
|
837
1366
|
}
|
|
838
1367
|
|
|
1368
|
+
// src/campaign/presets/playback.ts
|
|
1369
|
+
/**
 * Adapt a playback driver into a dispatch function.
 *
 * The returned async function calls `driver.run` with the scenario and the
 * incoming context extended with `profile`, then reduces the resulting events
 * through `extractProducedState`.
 *
 * @param driver - Object exposing `run(scenario, ctx)`.
 * @returns `(profile, scenario, ctx) => Promise<state>`.
 */
function makePlaybackDispatch(driver) {
  return async (profile, scenario, ctx) => {
    const runContext = { ...ctx, profile };
    const playbackEvents = await driver.run(scenario, runContext);
    return extractProducedState(playbackEvents);
  };
}
|
|
1375
|
+
/**
 * Score one user story against a produced state.
 *
 * Builds a task from the story (`id` becomes `taskId`, `requirements` is passed
 * through), delegates to `verifyCompletion`, and returns its verdict with the
 * story title attached.
 *
 * @param story - Story with `id`, `title` and `requirements`.
 * @param state - State handed to `verifyCompletion` unchanged.
 * @param checkCorrectness - Checker handed to `verifyCompletion` unchanged.
 * @returns The verdict's own fields plus `title`.
 */
async function scoreUserStory(story, state, checkCorrectness) {
  const task = { taskId: story.id, requirements: story.requirements };
  const verdict = await verifyCompletion(task, state, checkCorrectness);
  return { ...verdict, title: story.title };
}
|
|
1383
|
+
/**
 * Flatten per-story verdicts into one scoreboard row per requirement.
 *
 * Rows keep the order of `verdicts` and, within each verdict, the order of its
 * `requirements`.
 *
 * @param verdicts - Iterable of verdicts carrying `taskId`, `title` and `requirements`.
 * @returns Rows of `{ storyId, storyTitle, reqId, reqTitle, status, evidence }`,
 *   where `status` is "PASS" when the requirement is satisfied and "FAIL" otherwise.
 */
function userStoryScoreboard(verdicts) {
  return Array.from(verdicts).flatMap((verdict) =>
    Array.from(verdict.requirements, (requirement) => ({
      storyId: verdict.taskId,
      storyTitle: verdict.title,
      reqId: requirement.reqId,
      reqTitle: requirement.title,
      status: requirement.satisfied ? "PASS" : "FAIL",
      evidence: requirement.evidence
    }))
  );
}
|
|
1399
|
+
/**
 * Aggregate scoreboard rows into headline counts.
 *
 * @param rows - Scoreboard rows; only `storyId` and `status` are read.
 * @returns `{ stories, storiesFullyComplete, requirements, passed, failed, passRate }`.
 *   A story counts as fully complete when every one of its rows is "PASS".
 *   `passRate` is 0 for an empty scoreboard.
 */
function scoreboardSummary(rows) {
  // storyId -> running { total, passed } tally for that story.
  const tallies = /* @__PURE__ */ new Map();
  let passCount = 0;
  for (const { storyId, status } of rows) {
    let tally = tallies.get(storyId);
    if (tally == null) {
      tally = { total: 0, passed: 0 };
      tallies.set(storyId, tally);
    }
    tally.total += 1;
    if (status === "PASS") {
      tally.passed += 1;
      passCount += 1;
    }
  }
  const completeStories = [...tallies.values()].filter(
    (tally) => tally.total > 0 && tally.passed === tally.total
  ).length;
  const requirementCount = rows.length;
  return {
    stories: tallies.size,
    storiesFullyComplete: completeStories,
    requirements: requirementCount,
    passed: passCount,
    failed: requirementCount - passCount,
    passRate: requirementCount === 0 ? 0 : passCount / requirementCount
  };
}
|
|
1422
|
+
/**
 * Make a value safe to place inside a single Markdown table cell.
 *
 * - `|` is backslash-escaped so it cannot end the cell.
 * - Every line break (`\r\n`, lone `\n`, lone `\r`) collapses to one space;
 *   a raw line break would terminate the table row.
 * - `null` / `undefined` render as an empty string instead of throwing, and
 *   other non-string values are stringified.
 *
 * @param s - Cell content.
 * @returns {string} Single-line, pipe-escaped text.
 */
function escapeCell(s) {
  const text = String(s ?? "");
  // `\r\n` is listed first so a Windows line ending becomes one space, not two.
  return text.replace(/\|/g, "\\|").replace(/\r\n|\r|\n/g, " ");
}
|
|
1425
|
+
/**
 * Shorten `s` to at most `max` UTF-16 code units, marking the cut with a
 * trailing ellipsis (U+2026).
 *
 * Strings that already fit are returned unchanged. When truncating, `max - 1`
 * units of the original are kept (never fewer than zero) and the ellipsis is
 * appended, so a `max` below 1 still yields just the ellipsis.
 *
 * If the cut would fall between the two halves of a surrogate pair, the cut
 * moves back one unit so the result never contains a lone surrogate.
 *
 * @param {string} s - Text to shorten.
 * @param {number} max - Maximum length in UTF-16 code units.
 * @returns {string} `s` itself, or a shortened copy ending in an ellipsis.
 */
function truncate2(s, max) {
  if (s.length <= max) return s;
  let cut = Math.max(0, max - 1);
  const isHighSurrogate = (unit) => unit >= 0xd800 && unit <= 0xdbff;
  const isLowSurrogate = (unit) => unit >= 0xdc00 && unit <= 0xdfff;
  if (cut > 0 && isHighSurrogate(s.charCodeAt(cut - 1)) && isLowSurrogate(s.charCodeAt(cut))) {
    cut -= 1;
  }
  return `${s.slice(0, cut)}\u2026`;
}
|
|
1428
|
+
/**
 * Render scoreboard rows as a Markdown report.
 *
 * Layout, in order:
 *   1. `# <title>` heading.
 *   2. Optional bullet list built from `opts.meta` entries.
 *   3. One summary line (stories fully shipped, requirements passing, open count).
 *   4. An "Open tickets" table of every FAIL row, or a note that none are open.
 *   5. A "Per-story tick-off" section with one table per story, in order of
 *      first appearance in `rows`.
 *
 * @param rows - Scoreboard rows (`storyId`, `storyTitle`, `reqTitle`, `status`, `evidence`).
 * @param opts - Optional settings:
 *   `title` (heading text, default "Product-flow playback scoreboard"),
 *   `meta` (object rendered as `- **key:** value` bullets),
 *   `maxEvidenceChars` (evidence cell length cap, default 160).
 * @returns {string} The report, lines joined with "\n".
 */
function renderScoreboardMarkdown(rows, opts = {}) {
  const maxEv = opts.maxEvidenceChars ?? 160;
  const sum = scoreboardSummary(rows);
  const pct = (n) => `${Math.round(n * 100)}%`;
  // Evidence entries are joined, capped, then escaped; an empty cell shows an em dash.
  // A row without an evidence list is treated as having no evidence.
  const ev = (e) => escapeCell(truncate2((e ?? []).join("; "), maxEv)) || "\u2014";
  const out = [`# ${opts.title ?? "Product-flow playback scoreboard"}`, ""];
  if (opts.meta) {
    for (const [k, v] of Object.entries(opts.meta)) out.push(`- **${k}:** ${v}`);
    out.push("");
  }
  out.push(
    `**${sum.storiesFullyComplete}/${sum.stories}** user stories fully shipped \xB7 **${sum.passed}/${sum.requirements}** requirements passing (${pct(sum.passRate)}) \xB7 **${sum.failed}** open`,
    ""
  );
  const fails = rows.filter((r) => r.status === "FAIL");
  if (fails.length > 0) {
    out.push("## Open tickets", "", "| Story | Requirement | Evidence |", "| --- | --- | --- |");
    for (const r of fails) {
      out.push(`| ${escapeCell(r.storyTitle)} | ${escapeCell(r.reqTitle)} | ${ev(r.evidence)} |`);
    }
    out.push("");
  } else {
    out.push("_All requirements passing \u2014 no open tickets._", "");
  }
  out.push("## Per-story tick-off", "");
  // Group rows by story in one pass. Map iteration follows insertion order, so
  // stories come out in order of first appearance without re-filtering `rows`
  // once per story.
  const rowsByStory = /* @__PURE__ */ new Map();
  for (const r of rows) {
    const bucket = rowsByStory.get(r.storyId);
    if (bucket) bucket.push(r);
    else rowsByStory.set(r.storyId, [r]);
  }
  for (const storyRows of rowsByStory.values()) {
    const passed = storyRows.filter((r) => r.status === "PASS").length;
    const mark = passed === storyRows.length ? "\u2705" : "\u26A0\uFE0F";
    out.push(
      `### ${escapeCell(storyRows[0].storyTitle)} \u2014 ${passed}/${storyRows.length} ${mark}`,
      "",
      "| Requirement | Status | Evidence |",
      "| --- | --- | --- |"
    );
    for (const r of storyRows) {
      out.push(
        `| ${escapeCell(r.reqTitle)} | ${r.status === "PASS" ? "\u2705 PASS" : "\u274C FAIL"} | ${ev(r.evidence)} |`
      );
    }
    out.push("");
  }
  return out.join("\n");
}
|
|
1472
|
+
|
|
839
1473
|
// src/campaign/presets/run-profile-matrix.ts
|
|
840
1474
|
import { createHash as createHash2 } from "crypto";
|
|
841
|
-
import { join as
|
|
1475
|
+
import { join as join4 } from "path";
|
|
842
1476
|
var ProfileMatrixError = class extends AgentEvalError {
|
|
843
1477
|
constructor(message) {
|
|
844
1478
|
super("profile_matrix", message);
|
|
@@ -989,7 +1623,7 @@ async function runProfileMatrix(opts) {
|
|
|
989
1623
|
captureSource: opts.captureSource,
|
|
990
1624
|
storage: opts.storage,
|
|
991
1625
|
now: opts.now,
|
|
992
|
-
runDir:
|
|
1626
|
+
runDir: join4(opts.runDir, sanitize(profile.id))
|
|
993
1627
|
});
|
|
994
1628
|
const profileRecords = [];
|
|
995
1629
|
for (const cell of campaign.cells) {
|
|
@@ -1061,7 +1695,7 @@ function rollupByPersona(records, scenarios, personaOf) {
|
|
|
1061
1695
|
// src/campaign/worktree/index.ts
|
|
1062
1696
|
import { execFileSync } from "child_process";
|
|
1063
1697
|
import { existsSync as existsSync2 } from "fs";
|
|
1064
|
-
import { basename, isAbsolute, join as
|
|
1698
|
+
import { basename, isAbsolute, join as join5 } from "path";
|
|
1065
1699
|
var WorktreeAdapterError = class extends Error {
|
|
1066
1700
|
constructor(message, cause) {
|
|
1067
1701
|
super(message);
|
|
@@ -1083,13 +1717,13 @@ function slug2(label) {
|
|
|
1083
1717
|
}
|
|
1084
1718
|
function gitWorktreeAdapter(opts) {
|
|
1085
1719
|
const git = opts.git ?? defaultGit;
|
|
1086
|
-
const worktreeDir = opts.worktreeDir ??
|
|
1720
|
+
const worktreeDir = opts.worktreeDir ?? join5(opts.repoRoot, ".worktrees");
|
|
1087
1721
|
const branchPrefix = opts.branchPrefix ?? "improve";
|
|
1088
1722
|
return {
|
|
1089
1723
|
async create({ baseRef, label }) {
|
|
1090
1724
|
const id = `${slug2(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`;
|
|
1091
1725
|
const branch = `${branchPrefix}/${id}`;
|
|
1092
|
-
const path =
|
|
1726
|
+
const path = join5(worktreeDir, id);
|
|
1093
1727
|
git(["worktree", "add", "-b", branch, path, baseRef], opts.repoRoot);
|
|
1094
1728
|
return { path, branch, baseRef };
|
|
1095
1729
|
},
|
|
@@ -1114,16 +1748,20 @@ function gitWorktreeAdapter(opts) {
|
|
|
1114
1748
|
}
|
|
1115
1749
|
/**
 * Work out the on-disk path for a surface's worktree.
 *
 * Resolution order:
 *   1. `surface.worktreeRef` itself, when it is an absolute path that exists.
 *   2. `<worktreeDir>/<basename of worktreeRef>`, when `worktreeDir` is truthy.
 *   3. `surface.worktreeRef` unchanged.
 *
 * @param surface - Object carrying `worktreeRef`.
 * @param worktreeDir - Optional directory that holds worktrees.
 * @returns {string} The resolved path.
 */
function resolveWorktreePath(surface, worktreeDir) {
  const ref = surface.worktreeRef;
  const isExistingAbsolutePath = isAbsolute(ref) && existsSync2(ref);
  if (isExistingAbsolutePath) {
    return ref;
  }
  return worktreeDir ? join5(worktreeDir, basename(ref)) : ref;
}
|
|
1120
1754
|
export {
|
|
1755
|
+
DRIVER_GUIDE,
|
|
1121
1756
|
FsLabeledScenarioStore,
|
|
1122
1757
|
LabeledScenarioStoreError,
|
|
1123
1758
|
ProfileMatrixError,
|
|
1124
1759
|
SkillPatchParseError,
|
|
1125
1760
|
WorktreeAdapterError,
|
|
1761
|
+
aceDriver,
|
|
1126
1762
|
applySkillPatch,
|
|
1763
|
+
buildAnalystSurfaceDispatch,
|
|
1764
|
+
buildEvidenceVector,
|
|
1127
1765
|
buildLoopProvenanceRecord,
|
|
1128
1766
|
campaignBreakdown,
|
|
1129
1767
|
campaignMeanComposite,
|
|
@@ -1137,23 +1775,30 @@ export {
|
|
|
1137
1775
|
emitLoopProvenance,
|
|
1138
1776
|
evolutionaryDriver,
|
|
1139
1777
|
extractH2Sections,
|
|
1778
|
+
failureModeRecallJudge,
|
|
1140
1779
|
fsCampaignStorage,
|
|
1141
1780
|
gepaDriver,
|
|
1142
1781
|
gepaParetoEntry,
|
|
1143
1782
|
gepaReflectionEntry,
|
|
1144
1783
|
gitWorktreeAdapter,
|
|
1784
|
+
haloDriver,
|
|
1145
1785
|
heldOutGate,
|
|
1146
1786
|
heldoutSignificance,
|
|
1147
1787
|
inMemoryCampaignStorage,
|
|
1148
1788
|
isProposedCandidate,
|
|
1149
1789
|
labelTrustRank,
|
|
1150
1790
|
loopProvenanceSpans,
|
|
1791
|
+
makePlaybackDispatch,
|
|
1792
|
+
memoryCurationDriver,
|
|
1151
1793
|
openAutoPr,
|
|
1152
1794
|
pairHoldout,
|
|
1795
|
+
paretoPolicy,
|
|
1796
|
+
paretoSignificanceGate,
|
|
1153
1797
|
parseSkillPatchResponse,
|
|
1154
1798
|
patchEditCount,
|
|
1155
1799
|
provenanceRecordPath,
|
|
1156
1800
|
provenanceSpansPath,
|
|
1801
|
+
renderScoreboardMarkdown,
|
|
1157
1802
|
resolveWorktreePath,
|
|
1158
1803
|
runCampaign,
|
|
1159
1804
|
runEval,
|
|
@@ -1161,9 +1806,14 @@ export {
|
|
|
1161
1806
|
runOptimization,
|
|
1162
1807
|
runProfileMatrix,
|
|
1163
1808
|
runSkillOpt,
|
|
1809
|
+
scoreUserStory,
|
|
1810
|
+
scoreboardSummary,
|
|
1811
|
+
selectDriver,
|
|
1164
1812
|
skillOptDriver,
|
|
1165
1813
|
skillOptEntry,
|
|
1166
1814
|
surfaceContentHash,
|
|
1167
|
-
surfaceHash
|
|
1815
|
+
surfaceHash,
|
|
1816
|
+
traceAnalystDriver,
|
|
1817
|
+
userStoryScoreboard
|
|
1168
1818
|
};
|
|
1169
1819
|
//# sourceMappingURL=index.js.map
|