@tangle-network/agent-eval 0.61.0 → 0.63.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +48 -8
- package/dist/adapters/http.d.ts +4 -1
- package/dist/adapters/langchain.d.ts +4 -1
- package/dist/adapters/otel.d.ts +4 -4
- package/dist/{agent-profile-9J9hxdm2.d.ts → agent-profile-DzcPHR1Z.d.ts} +1 -1
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/campaign/index.d.ts +388 -11
- package/dist/campaign/index.js +597 -12
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-GMXHLSLL.js → chunk-4ODZXQV2.js} +81 -98
- package/dist/chunk-4ODZXQV2.js.map +1 -0
- package/dist/{chunk-OLULBECP.js → chunk-7TPYV2ER.js} +27 -1
- package/dist/chunk-7TPYV2ER.js.map +1 -0
- package/dist/chunk-E22YUOAL.js +111 -0
- package/dist/chunk-E22YUOAL.js.map +1 -0
- package/dist/{chunk-SUGME4OT.js → chunk-Z7ZU7IYZ.js} +209 -85
- package/dist/chunk-Z7ZU7IYZ.js.map +1 -0
- package/dist/contract/index.d.ts +9 -9
- package/dist/contract/index.js +4 -3
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-Bf8owbuG.d.ts → control-DxvZeV5X.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/hosted/index.d.ts +4 -4
- package/dist/{index-Bvk35ils.d.ts → index-DsnOpCO6.d.ts} +1 -1
- package/dist/{index-D9dwa00f.d.ts → index-GISRh500.d.ts} +2 -2
- package/dist/index.d.ts +98 -14
- package/dist/index.js +331 -128
- package/dist/index.js.map +1 -1
- package/dist/meta-eval/index.d.ts +2 -2
- package/dist/multishot/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/{provenance-D0WeCXt1.d.ts → provenance-cUnovpWV.d.ts} +42 -11
- package/dist/{registry-qmbYT3Eo.d.ts → registry-DPly4_hZ.d.ts} +1 -1
- package/dist/{release-report-DszkgvJ3.d.ts → release-report-DGoeObZT.d.ts} +2 -2
- package/dist/reporting.d.ts +4 -4
- package/dist/{researcher-BaVsy0sW.d.ts → researcher-WJvIpX3L.d.ts} +2 -2
- package/dist/rl.d.ts +6 -6
- package/dist/{rubric-predictive-validity-DgBHWsh7.d.ts → rubric-predictive-validity-D_4BSXGV.d.ts} +1 -1
- package/dist/{run-campaign-HXPJAUZ3.js → run-campaign-5J3ED2UJ.js} +3 -2
- package/dist/{run-record-DgUVo5pw.d.ts → run-record-BgTFzO2r.d.ts} +1 -1
- package/dist/{summary-report-BQvXpvaR.d.ts → summary-report-ByiOUrHj.d.ts} +1 -1
- package/dist/{types-Beb6KPqZ.d.ts → types-c2R2kfmv.d.ts} +45 -12
- package/package.json +1 -1
- package/dist/chunk-GMXHLSLL.js.map +0 -1
- package/dist/chunk-OLULBECP.js.map +0 -1
- package/dist/chunk-SUGME4OT.js.map +0 -1
- /package/dist/{run-campaign-HXPJAUZ3.js.map → run-campaign-5J3ED2UJ.js.map} +0 -0
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
import {
|
|
2
2
|
runCampaign
|
|
3
|
-
} from "./chunk-
|
|
3
|
+
} from "./chunk-7TPYV2ER.js";
|
|
4
4
|
import {
|
|
5
5
|
buildReflectionPrompt,
|
|
6
|
+
paretoFrontier,
|
|
6
7
|
parseReflectionResponse,
|
|
7
8
|
runCanaries,
|
|
8
|
-
scoreRedTeamOutput
|
|
9
|
+
scoreRedTeamOutput
|
|
10
|
+
} from "./chunk-4ODZXQV2.js";
|
|
11
|
+
import {
|
|
9
12
|
summarizeBackendIntegrity
|
|
10
|
-
} from "./chunk-
|
|
13
|
+
} from "./chunk-E22YUOAL.js";
|
|
11
14
|
import {
|
|
12
15
|
detectRewardHacking
|
|
13
16
|
} from "./chunk-YV7J7X5N.js";
|
|
@@ -145,52 +148,120 @@ function evolutionaryDriver(opts) {
|
|
|
145
148
|
|
|
146
149
|
// src/campaign/drivers/gepa.ts
|
|
147
150
|
var REFLECTION_SYSTEM = 'You are an expert prompt engineer. Output ONLY a JSON object of shape {"proposals":[{"label":string,"rationale":string,"payload":string}]} where each `payload` is the FULL improved surface text. No prose outside the JSON.';
|
|
151
|
+
var COMBINE_SYSTEM = 'You are an expert prompt engineer performing a GEPA "combine complementary lessons" merge. You are given several non-dominated versions of one surface; each is uniquely best on different scenarios. Produce ONE new version that keeps what makes each version strong on its winning scenarios and resolves conflicts in favor of the more general rule. Output ONLY a JSON object of shape {"proposals":[{"label":string,"rationale":string,"payload":string}]} with exactly one proposal whose `payload` is the FULL merged surface text. No prose outside the JSON.';
|
|
148
152
|
function gepaDriver(opts) {
|
|
149
153
|
const evidenceK = opts.evidenceK ?? 3;
|
|
154
|
+
const combineParents = opts.combineParents ?? true;
|
|
155
|
+
const combineMaxParents = opts.combineMaxParents ?? 4;
|
|
156
|
+
if (combineParents && combineMaxParents < 1) {
|
|
157
|
+
throw new Error("gepaDriver: combineMaxParents must be >= 1 when combineParents is enabled");
|
|
158
|
+
}
|
|
150
159
|
return {
|
|
151
160
|
kind: "gepa",
|
|
152
161
|
async propose(ctx) {
|
|
153
162
|
const parent = typeof ctx.currentSurface === "string" ? ctx.currentSurface : JSON.stringify(ctx.currentSurface);
|
|
154
|
-
const { top, bottom, target } = buildEvidence(ctx, evidenceK, opts.target);
|
|
155
|
-
const userPrompt = buildReflectionPrompt({
|
|
156
|
-
target,
|
|
157
|
-
parentPayload: parent,
|
|
158
|
-
topTrials: top,
|
|
159
|
-
bottomTrials: bottom,
|
|
160
|
-
childCount: ctx.populationSize,
|
|
161
|
-
mutationPrimitives: opts.mutationPrimitives
|
|
162
|
-
});
|
|
163
|
-
const result = await callLlm(
|
|
164
|
-
{
|
|
165
|
-
model: opts.model,
|
|
166
|
-
messages: [
|
|
167
|
-
{ role: "system", content: REFLECTION_SYSTEM },
|
|
168
|
-
{ role: "user", content: userPrompt }
|
|
169
|
-
],
|
|
170
|
-
jsonMode: true,
|
|
171
|
-
temperature: opts.temperature ?? 0.7,
|
|
172
|
-
maxTokens: opts.maxTokens ?? 6e3
|
|
173
|
-
},
|
|
174
|
-
opts.llm
|
|
175
|
-
);
|
|
176
|
-
const proposals = parseReflectionResponse(result.content, ctx.populationSize);
|
|
177
|
-
const out = [];
|
|
178
|
-
const seen = /* @__PURE__ */ new Set();
|
|
179
163
|
const constraints = opts.constraints;
|
|
180
164
|
const preserveSections = constraints?.preserveSections !== void 0 ? constraints.preserveSections.length === 0 ? extractH2Sections(parent) : constraints.preserveSections : null;
|
|
181
165
|
const maxEdits = constraints?.maxSentenceEdits;
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
if (
|
|
166
|
+
const out = [];
|
|
167
|
+
const seen = /* @__PURE__ */ new Set();
|
|
168
|
+
const accept = (payload, label, rationale) => {
|
|
169
|
+
const text = typeof payload === "string" ? payload.trim() : "";
|
|
170
|
+
if (!text || text === parent || seen.has(text)) return;
|
|
171
|
+
if (preserveSections && !validatePreservedSections(text, preserveSections)) return;
|
|
172
|
+
if (maxEdits !== void 0 && countSentenceEdits(parent, text) > maxEdits * 2) return;
|
|
187
173
|
seen.add(text);
|
|
188
|
-
out.push({ surface: text, label
|
|
174
|
+
out.push({ surface: text, label, rationale });
|
|
175
|
+
};
|
|
176
|
+
const stringParents = (combineParents ? ctx.paretoParents ?? [] : []).filter((p) => typeof p.surface === "string").sort((a, b) => b.composite - a.composite).slice(0, combineMaxParents);
|
|
177
|
+
if (stringParents.length > 1) {
|
|
178
|
+
const combinePrompt = buildCombinePrompt({
|
|
179
|
+
target: opts.target,
|
|
180
|
+
parents: stringParents,
|
|
181
|
+
evidenceK
|
|
182
|
+
});
|
|
183
|
+
const combineResult = await callLlm(
|
|
184
|
+
{
|
|
185
|
+
model: opts.model,
|
|
186
|
+
messages: [
|
|
187
|
+
{ role: "system", content: COMBINE_SYSTEM },
|
|
188
|
+
{ role: "user", content: combinePrompt }
|
|
189
|
+
],
|
|
190
|
+
jsonMode: true,
|
|
191
|
+
temperature: opts.temperature ?? 0.7,
|
|
192
|
+
maxTokens: opts.maxTokens ?? 6e3
|
|
193
|
+
},
|
|
194
|
+
opts.llm
|
|
195
|
+
);
|
|
196
|
+
const merged = parseReflectionResponse(combineResult.content, 1)[0];
|
|
197
|
+
if (merged) {
|
|
198
|
+
accept(
|
|
199
|
+
merged.payload,
|
|
200
|
+
merged.label || "pareto-combine",
|
|
201
|
+
merged.rationale || `combined ${stringParents.length} non-dominated parents (gen ${stringParents.map((p) => p.generation).join(",")})`
|
|
202
|
+
);
|
|
203
|
+
}
|
|
189
204
|
}
|
|
190
|
-
|
|
205
|
+
const reflectCount = Math.max(0, ctx.populationSize - out.length);
|
|
206
|
+
if (reflectCount > 0) {
|
|
207
|
+
const { top, bottom, target } = buildEvidence(ctx, evidenceK, opts.target);
|
|
208
|
+
const userPrompt = buildReflectionPrompt({
|
|
209
|
+
target,
|
|
210
|
+
parentPayload: parent,
|
|
211
|
+
topTrials: top,
|
|
212
|
+
bottomTrials: bottom,
|
|
213
|
+
childCount: reflectCount,
|
|
214
|
+
mutationPrimitives: opts.mutationPrimitives
|
|
215
|
+
});
|
|
216
|
+
const result = await callLlm(
|
|
217
|
+
{
|
|
218
|
+
model: opts.model,
|
|
219
|
+
messages: [
|
|
220
|
+
{ role: "system", content: REFLECTION_SYSTEM },
|
|
221
|
+
{ role: "user", content: userPrompt }
|
|
222
|
+
],
|
|
223
|
+
jsonMode: true,
|
|
224
|
+
temperature: opts.temperature ?? 0.7,
|
|
225
|
+
maxTokens: opts.maxTokens ?? 6e3
|
|
226
|
+
},
|
|
227
|
+
opts.llm
|
|
228
|
+
);
|
|
229
|
+
for (const proposal of parseReflectionResponse(result.content, reflectCount)) {
|
|
230
|
+
accept(proposal.payload, proposal.label, proposal.rationale);
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
return out.slice(0, ctx.populationSize);
|
|
191
234
|
}
|
|
192
235
|
};
|
|
193
236
|
}
|
|
237
|
+
function buildCombinePrompt(args) {
|
|
238
|
+
const lines = [
|
|
239
|
+
`You are merging ${args.parents.length} versions of: ${args.target}.`,
|
|
240
|
+
"",
|
|
241
|
+
"Each version is on the Pareto frontier \u2014 none dominates the others; each",
|
|
242
|
+
"wins on different scenarios. Combine their complementary strengths into",
|
|
243
|
+
"ONE version. Below, each version lists the scenarios it scores highest on.",
|
|
244
|
+
""
|
|
245
|
+
];
|
|
246
|
+
args.parents.forEach((p, i) => {
|
|
247
|
+
const tag = String.fromCharCode(65 + i);
|
|
248
|
+
const best = Object.entries(p.objectives).sort((a, b) => b[1] - a[1]).slice(0, args.evidenceK).map(([id, score]) => `${id} (${score.toFixed(2)})`);
|
|
249
|
+
lines.push(
|
|
250
|
+
`### Version ${tag} (mean ${p.composite.toFixed(2)}; strongest on: ${best.join(", ") || "n/a"})`,
|
|
251
|
+
"```",
|
|
252
|
+
p.surface,
|
|
253
|
+
"```",
|
|
254
|
+
""
|
|
255
|
+
);
|
|
256
|
+
});
|
|
257
|
+
lines.push(
|
|
258
|
+
"Return ONE merged version that would score well on the union of every",
|
|
259
|
+
"version's winning scenarios. Keep each version's specific winning rule;",
|
|
260
|
+
"where two rules conflict, prefer the more general one and note the choice",
|
|
261
|
+
"in your rationale."
|
|
262
|
+
);
|
|
263
|
+
return lines.join("\n");
|
|
264
|
+
}
|
|
194
265
|
function extractH2Sections(text) {
|
|
195
266
|
const out = [];
|
|
196
267
|
for (const line of text.split("\n")) {
|
|
@@ -451,9 +522,45 @@ function labelTrustRank(trust) {
|
|
|
451
522
|
return LABEL_TRUST_RANK[trust ?? "unverified"];
|
|
452
523
|
}
|
|
453
524
|
|
|
454
|
-
// src/campaign/
|
|
455
|
-
|
|
456
|
-
|
|
525
|
+
// src/campaign/score-utils.ts
|
|
526
|
+
function campaignMeanComposite(campaign) {
|
|
527
|
+
const composites = [];
|
|
528
|
+
for (const cell of campaign.cells) {
|
|
529
|
+
const cellComposites = Object.values(cell.judgeScores).map((s) => s.composite);
|
|
530
|
+
if (cellComposites.length > 0) {
|
|
531
|
+
composites.push(cellComposites.reduce((a, b) => a + b, 0) / cellComposites.length);
|
|
532
|
+
}
|
|
533
|
+
}
|
|
534
|
+
return composites.length === 0 ? 0 : composites.reduce((a, b) => a + b, 0) / composites.length;
|
|
535
|
+
}
|
|
536
|
+
function campaignBreakdown(campaign) {
|
|
537
|
+
const dimSums = {};
|
|
538
|
+
const dimCounts = {};
|
|
539
|
+
const byScenario = /* @__PURE__ */ new Map();
|
|
540
|
+
for (const cell of campaign.cells) {
|
|
541
|
+
const judgeScores = Object.values(cell.judgeScores);
|
|
542
|
+
if (judgeScores.length === 0) continue;
|
|
543
|
+
const cellComposite = judgeScores.reduce((a, s) => a + s.composite, 0) / judgeScores.length;
|
|
544
|
+
const arr = byScenario.get(cell.scenarioId) ?? [];
|
|
545
|
+
arr.push(cellComposite);
|
|
546
|
+
byScenario.set(cell.scenarioId, arr);
|
|
547
|
+
for (const score of judgeScores) {
|
|
548
|
+
for (const [key, value] of Object.entries(score.dimensions)) {
|
|
549
|
+
dimSums[key] = (dimSums[key] ?? 0) + value;
|
|
550
|
+
dimCounts[key] = (dimCounts[key] ?? 0) + 1;
|
|
551
|
+
}
|
|
552
|
+
}
|
|
553
|
+
}
|
|
554
|
+
const dimensions = {};
|
|
555
|
+
for (const key of Object.keys(dimSums)) {
|
|
556
|
+
const count = dimCounts[key] ?? 0;
|
|
557
|
+
dimensions[key] = count > 0 ? (dimSums[key] ?? 0) / count : 0;
|
|
558
|
+
}
|
|
559
|
+
const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => ({
|
|
560
|
+
scenarioId,
|
|
561
|
+
composite: comps.reduce((a, b) => a + b, 0) / comps.length
|
|
562
|
+
}));
|
|
563
|
+
return { dimensions, scenarios };
|
|
457
564
|
}
|
|
458
565
|
|
|
459
566
|
// src/campaign/presets/run-optimization.ts
|
|
@@ -470,11 +577,15 @@ async function runOptimization(opts) {
|
|
|
470
577
|
let currentSurfaces = [opts.baselineSurface];
|
|
471
578
|
let winnerSurface = opts.baselineSurface;
|
|
472
579
|
let winnerSurfaceHash = surfaceHash(opts.baselineSurface);
|
|
473
|
-
let winnerComposite =
|
|
580
|
+
let winnerComposite = campaignMeanComposite(baselineCampaign);
|
|
474
581
|
let winnerLabel;
|
|
475
582
|
let winnerRationale;
|
|
583
|
+
const scored = [
|
|
584
|
+
toParetoParent(opts.baselineSurface, winnerSurfaceHash, baselineCampaign, -1)
|
|
585
|
+
];
|
|
476
586
|
for (let gen = 0; gen < opts.maxGenerations; gen++) {
|
|
477
587
|
if (opts.driver.decide?.({ history }).stop) break;
|
|
588
|
+
const paretoParents = computeParetoFrontier(scored);
|
|
478
589
|
const proposed = await opts.driver.propose({
|
|
479
590
|
currentSurface: currentSurfaces[0] ?? opts.baselineSurface,
|
|
480
591
|
history,
|
|
@@ -484,7 +595,8 @@ async function runOptimization(opts) {
|
|
|
484
595
|
signal: new AbortController().signal,
|
|
485
596
|
report: opts.report,
|
|
486
597
|
dataset: opts.labeledStore && opts.labeledStore !== "off" ? opts.labeledStore : void 0,
|
|
487
|
-
maxImprovementShots: opts.maxImprovementShots
|
|
598
|
+
maxImprovementShots: opts.maxImprovementShots,
|
|
599
|
+
paretoParents
|
|
488
600
|
});
|
|
489
601
|
const candidates = proposed.map(
|
|
490
602
|
(p) => isProposedCandidate(p) ? p : { surface: p, label: "", rationale: "" }
|
|
@@ -498,8 +610,11 @@ async function runOptimization(opts) {
|
|
|
498
610
|
dispatch: (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx),
|
|
499
611
|
runDir: `${opts.runDir}/gen-${gen}/candidate-${i}`
|
|
500
612
|
});
|
|
501
|
-
const composite =
|
|
613
|
+
const composite = campaignMeanComposite(campaign);
|
|
502
614
|
surfaceResults.push({ surfaceHash: hash, surface, label, rationale, campaign, composite });
|
|
615
|
+
scored.push(
|
|
616
|
+
toParetoParent(surface, hash, campaign, gen, label || void 0, rationale || void 0)
|
|
617
|
+
);
|
|
503
618
|
}
|
|
504
619
|
surfaceResults.sort((a, b) => b.composite - a.composite);
|
|
505
620
|
const promoted = surfaceResults.slice(0, promoteTopK);
|
|
@@ -515,7 +630,7 @@ async function runOptimization(opts) {
|
|
|
515
630
|
const record = {
|
|
516
631
|
generationIndex: gen,
|
|
517
632
|
candidates: surfaceResults.map((s) => {
|
|
518
|
-
const breakdown =
|
|
633
|
+
const breakdown = campaignBreakdown(s.campaign);
|
|
519
634
|
const candidate = {
|
|
520
635
|
surfaceHash: s.surfaceHash,
|
|
521
636
|
composite: s.composite,
|
|
@@ -545,8 +660,49 @@ async function runOptimization(opts) {
|
|
|
545
660
|
winnerSurfaceHash,
|
|
546
661
|
winnerLabel,
|
|
547
662
|
winnerRationale,
|
|
548
|
-
baselineCampaign
|
|
663
|
+
baselineCampaign,
|
|
664
|
+
paretoFrontier: computeParetoFrontier(scored)
|
|
665
|
+
};
|
|
666
|
+
}
|
|
667
|
+
function toParetoParent(surface, hash, campaign, generation, label, rationale) {
|
|
668
|
+
const objectives = {};
|
|
669
|
+
for (const { scenarioId, composite } of campaignBreakdown(campaign).scenarios) {
|
|
670
|
+
objectives[scenarioId] = composite;
|
|
671
|
+
}
|
|
672
|
+
const parent = {
|
|
673
|
+
surface,
|
|
674
|
+
surfaceHash: hash,
|
|
675
|
+
objectives,
|
|
676
|
+
composite: campaignMeanComposite(campaign),
|
|
677
|
+
generation
|
|
549
678
|
};
|
|
679
|
+
if (label) parent.label = label;
|
|
680
|
+
if (rationale) parent.rationale = rationale;
|
|
681
|
+
return parent;
|
|
682
|
+
}
|
|
683
|
+
function computeParetoFrontier(scored) {
|
|
684
|
+
if (scored.length <= 1) return [...scored];
|
|
685
|
+
const ids = /* @__PURE__ */ new Set();
|
|
686
|
+
for (const p of scored) for (const id of Object.keys(p.objectives)) ids.add(id);
|
|
687
|
+
if (ids.size === 0) return [...scored];
|
|
688
|
+
const floor = {};
|
|
689
|
+
for (const id of ids) {
|
|
690
|
+
let min = Number.POSITIVE_INFINITY;
|
|
691
|
+
for (const p of scored) {
|
|
692
|
+
const v = p.objectives[id];
|
|
693
|
+
if (typeof v === "number" && Number.isFinite(v) && v < min) min = v;
|
|
694
|
+
}
|
|
695
|
+
floor[id] = Number.isFinite(min) ? min : 0;
|
|
696
|
+
}
|
|
697
|
+
const objectives = [...ids].map((id) => ({
|
|
698
|
+
name: id,
|
|
699
|
+
direction: "maximize",
|
|
700
|
+
value: (p) => {
|
|
701
|
+
const v = p.objectives[id];
|
|
702
|
+
return typeof v === "number" && Number.isFinite(v) ? v : floor[id] ?? 0;
|
|
703
|
+
}
|
|
704
|
+
}));
|
|
705
|
+
return paretoFrontier(scored, objectives).frontier;
|
|
550
706
|
}
|
|
551
707
|
function surfaceHash(surface) {
|
|
552
708
|
const material = typeof surface === "string" ? surface : JSON.stringify({
|
|
@@ -556,45 +712,6 @@ function surfaceHash(surface) {
|
|
|
556
712
|
});
|
|
557
713
|
return createHash("sha256").update(material).digest("hex").slice(0, 16);
|
|
558
714
|
}
|
|
559
|
-
function meanComposite2(campaign) {
|
|
560
|
-
const composites = [];
|
|
561
|
-
for (const cell of campaign.cells) {
|
|
562
|
-
const cellComposites = Object.values(cell.judgeScores).map((s) => s.composite);
|
|
563
|
-
if (cellComposites.length > 0) {
|
|
564
|
-
composites.push(cellComposites.reduce((a, b) => a + b, 0) / cellComposites.length);
|
|
565
|
-
}
|
|
566
|
-
}
|
|
567
|
-
return composites.length === 0 ? 0 : composites.reduce((a, b) => a + b, 0) / composites.length;
|
|
568
|
-
}
|
|
569
|
-
function candidateBreakdown(campaign) {
|
|
570
|
-
const dimSums = {};
|
|
571
|
-
const dimCounts = {};
|
|
572
|
-
const byScenario = /* @__PURE__ */ new Map();
|
|
573
|
-
for (const cell of campaign.cells) {
|
|
574
|
-
const judgeScores = Object.values(cell.judgeScores);
|
|
575
|
-
if (judgeScores.length === 0) continue;
|
|
576
|
-
const cellComposite = judgeScores.reduce((a, s) => a + s.composite, 0) / judgeScores.length;
|
|
577
|
-
const arr = byScenario.get(cell.scenarioId) ?? [];
|
|
578
|
-
arr.push(cellComposite);
|
|
579
|
-
byScenario.set(cell.scenarioId, arr);
|
|
580
|
-
for (const score of judgeScores) {
|
|
581
|
-
for (const [key, value] of Object.entries(score.dimensions)) {
|
|
582
|
-
dimSums[key] = (dimSums[key] ?? 0) + value;
|
|
583
|
-
dimCounts[key] = (dimCounts[key] ?? 0) + 1;
|
|
584
|
-
}
|
|
585
|
-
}
|
|
586
|
-
}
|
|
587
|
-
const dimensions = {};
|
|
588
|
-
for (const key of Object.keys(dimSums)) {
|
|
589
|
-
const count = dimCounts[key] ?? 0;
|
|
590
|
-
dimensions[key] = count > 0 ? (dimSums[key] ?? 0) / count : 0;
|
|
591
|
-
}
|
|
592
|
-
const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => ({
|
|
593
|
-
scenarioId,
|
|
594
|
-
composite: comps.reduce((a, b) => a + b, 0) / comps.length
|
|
595
|
-
}));
|
|
596
|
-
return { dimensions, scenarios };
|
|
597
|
-
}
|
|
598
715
|
|
|
599
716
|
// src/campaign/presets/run-improvement-loop.ts
|
|
600
717
|
async function runImprovementLoop(opts) {
|
|
@@ -612,7 +729,7 @@ async function runImprovementLoop(opts) {
|
|
|
612
729
|
throw new Error("runImprovementLoop: autoOnPromote='pr' requires ghOwner + ghRepo.");
|
|
613
730
|
}
|
|
614
731
|
const optimization = await runOptimization(opts);
|
|
615
|
-
const { runCampaign: runCampaign2 } = await import("./run-campaign-
|
|
732
|
+
const { runCampaign: runCampaign2 } = await import("./run-campaign-5J3ED2UJ.js");
|
|
616
733
|
const baselineOnHoldout = await runCampaign2({
|
|
617
734
|
...opts,
|
|
618
735
|
scenarios: opts.holdoutScenarios,
|
|
@@ -687,6 +804,11 @@ ${fmt(winnerSurface)}`;
|
|
|
687
804
|
return lines.join("\n");
|
|
688
805
|
}
|
|
689
806
|
|
|
807
|
+
// src/campaign/presets/run-eval.ts
|
|
808
|
+
async function runEval(opts) {
|
|
809
|
+
return runCampaign(opts);
|
|
810
|
+
}
|
|
811
|
+
|
|
690
812
|
// src/campaign/provenance.ts
|
|
691
813
|
import { createHash as createHash2 } from "crypto";
|
|
692
814
|
import { join as join2 } from "path";
|
|
@@ -911,11 +1033,13 @@ export {
|
|
|
911
1033
|
heldOutGate,
|
|
912
1034
|
isProposedCandidate,
|
|
913
1035
|
labelTrustRank,
|
|
914
|
-
|
|
1036
|
+
campaignMeanComposite,
|
|
1037
|
+
campaignBreakdown,
|
|
915
1038
|
runOptimization,
|
|
916
1039
|
surfaceHash,
|
|
917
1040
|
runImprovementLoop,
|
|
918
1041
|
defaultRenderDiff,
|
|
1042
|
+
runEval,
|
|
919
1043
|
surfaceContentHash,
|
|
920
1044
|
buildLoopProvenanceRecord,
|
|
921
1045
|
loopProvenanceSpans,
|
|
@@ -923,4 +1047,4 @@ export {
|
|
|
923
1047
|
provenanceSpansPath,
|
|
924
1048
|
emitLoopProvenance
|
|
925
1049
|
};
|
|
926
|
-
//# sourceMappingURL=chunk-
|
|
1050
|
+
//# sourceMappingURL=chunk-Z7ZU7IYZ.js.map
|