@tangle-network/agent-eval 0.41.0 → 0.42.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/dist/campaign/index.d.ts +48 -368
  2. package/dist/campaign/index.js +67 -1
  3. package/dist/campaign/index.js.map +1 -1
  4. package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
  5. package/dist/chunk-H4TOS272.js.map +1 -0
  6. package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
  7. package/dist/chunk-KQ26DYTQ.js.map +1 -0
  8. package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
  9. package/dist/chunk-MNL6LXGQ.js.map +1 -0
  10. package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
  11. package/dist/chunk-N4SBKEPJ.js.map +1 -0
  12. package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
  13. package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
  14. package/dist/control.d.ts +2 -2
  15. package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
  16. package/dist/index.d.ts +227 -687
  17. package/dist/index.js +753 -1237
  18. package/dist/index.js.map +1 -1
  19. package/dist/integrity-CTDhR1Sg.d.ts +81 -0
  20. package/dist/llm-client-BXVRUZyX.d.ts +234 -0
  21. package/dist/openapi.json +1 -1
  22. package/dist/pipelines/index.js +67 -3
  23. package/dist/pipelines/index.js.map +1 -1
  24. package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
  25. package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
  26. package/dist/reporting.d.ts +2 -3
  27. package/dist/reporting.js +4 -8
  28. package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
  29. package/dist/rl.d.ts +103 -221
  30. package/dist/rl.js +44 -199
  31. package/dist/rl.js.map +1 -1
  32. package/dist/sequential-DdV5ShjT.d.ts +561 -0
  33. package/dist/traces.d.ts +3 -2
  34. package/dist/traces.js +5 -5
  35. package/dist/types-BLbRTxoc.d.ts +367 -0
  36. package/dist/wire/index.d.ts +1 -1
  37. package/package.json +1 -6
  38. package/dist/chunk-5U2DOJU4.js.map +0 -1
  39. package/dist/chunk-AU2JLNSZ.js.map +0 -1
  40. package/dist/chunk-DMW5VENN.js +0 -1412
  41. package/dist/chunk-DMW5VENN.js.map +0 -1
  42. package/dist/chunk-EGIPWXHL.js.map +0 -1
  43. package/dist/chunk-MAZ26DC7.js +0 -99
  44. package/dist/chunk-MAZ26DC7.js.map +0 -1
  45. package/dist/chunk-NKLGKF2Q.js.map +0 -1
  46. package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
  47. package/dist/optimization.d.ts +0 -11
  48. package/dist/optimization.js +0 -71
  49. package/dist/optimization.js.map +0 -1
  50. package/dist/sequential-5iSVfzl2.d.ts +0 -139
  51. package/dist/summary-report-DuZXOk7K.d.ts +0 -917
  52. package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0
@@ -1,375 +1,13 @@
1
+ import { S as Scenario, C as CampaignResult, G as GateResult, M as Mutator, I as ImprovementDriver, a as Gate, L as LabeledScenarioStore, b as LabeledScenarioWrite, c as LabeledScenarioSampleArgs, d as LabeledScenarioRecord, D as DispatchFn, J as JudgeConfig, e as CampaignTraceWriter, f as MutableSurface, g as GenerationRecord, h as CodeSurface } from '../types-BLbRTxoc.js';
2
+ export { i as CampaignAggregates, j as CampaignArtifactWriter, k as CampaignCellResult, l as CampaignCostMeter, m as DispatchContext, n as GateContext, o as GateDecision, p as GenerationCandidate, q as JudgeAggregate, r as JudgeDimension, s as JudgeScore, t as LabeledScenarioSource, O as OptimizerConfig, P as ProposeContext, R as RedactionStatus, u as ScenarioAggregate, v as SessionScript, T as TraceSpan } from '../types-BLbRTxoc.js';
3
+ import { L as LlmClientOptions } from '../llm-client-BXVRUZyX.js';
1
4
  import { RunRecord } from '@tangle-network/agent-runtime';
2
5
  import { R as RedTeamCase } from '../red-team-30II1T4o.js';
3
- import '../dataset-BlwAtYYf.js';
4
6
  import '../errors-mje_cKOs.js';
7
+ import '../raw-provider-sink-C46HDghv.js';
8
+ import '../dataset-BlwAtYYf.js';
5
9
  import '../store-Db2Bv8Cf.js';
6
10
 
7
- /**
8
- * @experimental
9
- *
10
- * Pass A substrate types — `runCampaign` is the one primitive every
11
- * eval flow composes from. Three contracts in this file:
12
- *
13
- * - `Scenario` input set
14
- * - `DispatchFn` how to run one scenario → artifact
15
- * - `CampaignResult` defined output schema (the contract downstream tools depend on)
16
- *
17
- * Three more lifted from earlier substrate work (re-exported):
18
- *
19
- * - `JudgeConfig` pluggable dimensional scorer (0.38)
20
- * - `Mutator` optimization-loop surface mutator
21
- * - `Gate` promotion gate (`HeldOutGate` and friends adapt to this)
22
- *
23
- * No new architecture vs 0.38 — Pass A formalizes the shapes so consumers
24
- * can build dashboards / CI gates / regression diffs against a stable schema.
25
- */
26
- /** @experimental Stable identifier + kind tag for any scenario. Consumers
27
- * extend with their per-domain payload (persona, task, requirement, ...). */
28
- interface Scenario {
29
- id: string;
30
- kind: string;
31
- tags?: string[];
32
- }
33
- /** @experimental Context handed to every dispatch invocation. Scoped — every
34
- * trace/span carries the cellId, every artifact write lands under the cell's
35
- * artifact root, the cost meter accumulates per cell. */
36
- interface DispatchContext {
37
- cellId: string;
38
- rep: number;
39
- generation?: number;
40
- seed: number;
41
- signal: AbortSignal;
42
- trace: CampaignTraceWriter;
43
- artifacts: CampaignArtifactWriter;
44
- cost: CampaignCostMeter;
45
- /** Populated when this run is part of a multi-cycle improvement loop. */
46
- cycleId?: string;
47
- /** Populated when the substrate resumed from a prior cache hit. */
48
- resumedFrom?: string;
49
- }
50
- /** @experimental One function: scenario + ctx → artifact. Dispatcher chooses
51
- * whether to call `runMultishot`, `runLoop`, raw `streamPrompt`, anything. */
52
- type DispatchFn<TScenario extends Scenario, TArtifact> = (scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>;
53
- /** @experimental One session within a multi-session journey. Dispatch is
54
- * invoked once per session in order; state from prior session's artifact
55
- * is exposed via `ctx.priorSessionArtifact`. */
56
- interface SessionScript<TScenario, TArtifact> {
57
- id: string;
58
- intent: string;
59
- maxTurns?: number;
60
- /** When true, knowledge accumulated this session persists to next. */
61
- affectsKnowledge?: boolean;
62
- /** Optional per-session persona evolution — called after the session
63
- * resolves. Returns the persona shape used by the NEXT session. */
64
- evolveAfterSession?: (artifact: TArtifact, sessionIndex: number, scenario: TScenario) => TScenario;
65
- }
66
- interface JudgeDimension {
67
- /** JSON field name + score key. */
68
- key: string;
69
- /** Description shown in the judge's user prompt. */
70
- description: string;
71
- }
72
- /** @experimental Pluggable dimensional scorer. `score` is the contract:
73
- * given an artifact + scenario, return a `JudgeScore`. This is deliberately a
74
- * function, not a fixed LLM-prompt shape — real consumers judge with
75
- * ensembles, deterministic checks, or a single LLM call, and the substrate
76
- * must not constrain that. The `llmJudge()` helper builds a `score` that does
77
- * one LLM call for the common case. `appliesTo` lets a judge run only on
78
- * scenarios that match (e.g. a legal-citation judge only on legal scenarios). */
79
- interface JudgeConfig<TArtifact, TScenario extends Scenario = Scenario> {
80
- name: string;
81
- dimensions: JudgeDimension[];
82
- /** Score one artifact. Throw on failure — a thrown judge is recorded as a
83
- * failed cell, never silently folded into a zero. */
84
- score(input: {
85
- artifact: TArtifact;
86
- scenario: TScenario;
87
- signal: AbortSignal;
88
- }): JudgeScore | Promise<JudgeScore>;
89
- appliesTo?: (scenario: TScenario) => boolean;
90
- }
91
- interface JudgeScore {
92
- dimensions: Record<string, number>;
93
- composite: number;
94
- notes: string;
95
- }
96
- /** @experimental A tier-4 code surface — a candidate change to the agent's
97
- * IMPLEMENTATION, not its prompt. Produced by autoresearch (reads codebase +
98
- * trace findings → opens a worktree). Measured by checking out `worktreeRef`
99
- * and running the worker against the changed code. See the improvement-tier
100
- * table in `docs/design/loop-taxonomy.md`. */
101
- interface CodeSurface {
102
- kind: 'code';
103
- /** Worktree path or git ref holding the candidate code change. The
104
- * consumer's `dispatchWithSurface` checks this out before running. */
105
- worktreeRef: string;
106
- /** Base ref the change is measured against. Default: the repo's main. */
107
- baseRef?: string;
108
- /** Human summary of what changed — rendered into the auto-PR body. */
109
- summary?: string;
110
- }
111
- /** @experimental The mutable surface a driver proposes. Tiers (see
112
- * `docs/design/loop-taxonomy.md`):
113
- * - `string` — tiers 1-2: system-prompt addendum / serialized tool
114
- * config. Cheap, reversible, text-diffable.
115
- * - `CodeSurface` — tier 4: an implementation change behind a worktree ref.
116
- * Tier 3 (knowledge) is owned by agent-knowledge and rides its own adapter,
117
- * not this type. */
118
- type MutableSurface = string | CodeSurface;
119
- /** @experimental Stateless surface mutation — given findings + current
120
- * surface, return N candidate surfaces. Pure transform, no generation
121
- * awareness. Reflective-mutation, `runMultiShotOptimization`, `AxGEPA`
122
- * conform. Wrapped by `evolutionaryDriver` to become an `ImprovementDriver`. */
123
- interface Mutator<TFindings = unknown> {
124
- kind: string;
125
- mutate(args: {
126
- findings: TFindings[];
127
- currentSurface: MutableSurface;
128
- populationSize: number;
129
- signal: AbortSignal;
130
- }): Promise<MutableSurface[]>;
131
- }
132
- /** @experimental Everything a driver's `propose()` may read to plan the next
133
- * batch of candidates. The first six fields are always present; the rest are
134
- * optional context the loop supplies when available, so cheap drivers
135
- * (`evolutionaryDriver`) can ignore them while a code-tier agentic generator
136
- * consumes the research report + dataset to drive a coding harness.
137
- * See `docs/design/self-improvement-engine.md`. */
138
- interface ProposeContext<TFindings = unknown> {
139
- currentSurface: MutableSurface;
140
- history: GenerationRecord[];
141
- findings: TFindings[];
142
- /** BREADTH: how many candidate surfaces to return this generation. */
143
- populationSize: number;
144
- generation: number;
145
- signal: AbortSignal;
146
- /** The Phase-2 research report (analyst findings + diff), produced AFTER the
147
- * trace analysts run. Opaque to the substrate — the driver that consumes it
148
- * types it. See the phase diagram in self-improvement-engine.md. */
149
- report?: unknown;
150
- /** Handle to all captured data — the driver samples traces / artifacts /
151
- * rewards here to ground its proposals. */
152
- dataset?: LabeledScenarioStore;
153
- /** DEPTH: max iterations the agentic generator may take per candidate.
154
- * 1 = single-shot; >1 = it may iterate on its own change before handing it
155
- * back to be measured. */
156
- maxImprovementShots?: number;
157
- }
158
- /** @experimental A surface-improvement strategy — the DRIVER of the
159
- * improvement loop. Given the current best surface, the history of what's
160
- * been tried + scored, and any external findings, propose the next batch of
161
- * candidate surfaces to measure. Optionally decide to stop early.
162
- *
163
- * The evolutionary mutator (`evolutionaryDriver`, here) and agent-runtime's
164
- * `improvementDriver` (with reflective / agentic generators) both conform —
165
- * drivers of the SAME loop, not separate loops. The loop body
166
- * (`runOptimization`) and the gated promotion shell (`runImprovementLoop`)
167
- * are driver-agnostic. */
168
- interface ImprovementDriver<TFindings = unknown> {
169
- kind: string;
170
- /** Plan: propose N candidate surfaces for the next generation. */
171
- propose(ctx: ProposeContext<TFindings>): Promise<MutableSurface[]>;
172
- /** Decide: stop early when the driver judges the search converged or
173
- * exhausted. Default (omitted) runs all `maxGenerations`. */
174
- decide?(args: {
175
- history: GenerationRecord[];
176
- }): {
177
- stop: boolean;
178
- reason?: string;
179
- };
180
- }
181
- interface OptimizerConfig {
182
- driver: ImprovementDriver;
183
- populationSize: number;
184
- maxGenerations: number;
185
- surfaceExtractor: (profile: unknown) => MutableSurface;
186
- }
187
- /** @experimental Five-valued verdict taxonomy (MOSS-paper alignment). */
188
- type GateDecision = 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling';
189
- interface GateContext<TArtifact, TScenario extends Scenario> {
190
- candidateArtifacts: Map<string, TArtifact>;
191
- baselineArtifacts?: Map<string, TArtifact>;
192
- /** Candidate (winner) judge scores, keyed by cellId. */
193
- judgeScores: Map<string, Record<string, JudgeScore>>;
194
- /** Baseline judge scores, keyed by cellId. SEPARATE from `judgeScores` —
195
- * baseline + candidate share cellIds (same scenarios), so a single map
196
- * cannot represent both. A gate computing a holdout delta MUST read
197
- * candidate from `judgeScores` and baseline from here. */
198
- baselineJudgeScores?: Map<string, Record<string, JudgeScore>>;
199
- scenarios: TScenario[];
200
- cost: {
201
- candidate: number;
202
- baseline: number;
203
- };
204
- signal: AbortSignal;
205
- }
206
- interface GateResult {
207
- decision: GateDecision;
208
- reasons: string[];
209
- contributingGates: Array<{
210
- name: string;
211
- passed: boolean;
212
- detail: unknown;
213
- }>;
214
- delta?: number;
215
- }
216
- /** @experimental Composable promotion gate. */
217
- interface Gate<TArtifact = unknown, TScenario extends Scenario = Scenario> {
218
- name: string;
219
- decide(ctx: GateContext<TArtifact, TScenario>): Promise<GateResult>;
220
- }
221
- /** @experimental Scoped trace writer handed to each dispatch — every span
222
- * auto-tagged with the cellId so traces filter cleanly. */
223
- interface CampaignTraceWriter {
224
- span(name: string, attributes?: Record<string, unknown>): TraceSpan;
225
- flush(): Promise<void>;
226
- }
227
- interface TraceSpan {
228
- end(attributes?: Record<string, unknown>): void;
229
- setAttribute(key: string, value: unknown): void;
230
- }
231
- /** @experimental Scoped artifact writer — `write(path, content)` lands under
232
- * `<runDir>/<cellId>/<path>`. */
233
- interface CampaignArtifactWriter {
234
- write(path: string, content: string | Uint8Array): Promise<string>;
235
- writeJson(path: string, value: unknown): Promise<string>;
236
- }
237
- /** @experimental Cell-scoped cost meter. Substrate auto-tracks LLM costs
238
- * via the cost-ledger backend hooks; consumers can record additional
239
- * spend (sandbox time, tool costs) via `observe`. */
240
- interface CampaignCostMeter {
241
- observe(amountUsd: number, source: string): void;
242
- current(): number;
243
- }
244
- /** @experimental Source tag — required on every store write. Used by the
245
- * default training-source filter (production-trace samples NOT used as
246
- * training scenarios unless explicitly opted in). */
247
- type LabeledScenarioSource = 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic';
248
- type RedactionStatus = 'raw' | 'redacted-pii' | 'redacted-secrets' | 'fully-redacted';
249
- /** @experimental Required-provenance write. The store rejects writes that
250
- * lack provenance — a default-on flywheel without provenance is the
251
- * data-poisoning vector flagged in the alignment review. */
252
- interface LabeledScenarioWrite<TScenario extends Scenario = Scenario, TArtifact = unknown> {
253
- scenario: TScenario;
254
- artifact: TArtifact;
255
- judgeScores: Record<string, JudgeScore>;
256
- source: LabeledScenarioSource;
257
- sourceVersionHash: string;
258
- capturedAt: string;
259
- redactionStatus: RedactionStatus;
260
- /** Optional per-source rate-limit bucket key (e.g., the tenant id). */
261
- rateLimitBucket?: string;
262
- }
263
- interface LabeledScenarioRecord<TScenario extends Scenario = Scenario, TArtifact = unknown> extends LabeledScenarioWrite<TScenario, TArtifact> {
264
- /** Stable hash of (scenario.id, source, capturedAt, sourceVersionHash). */
265
- recordHash: string;
266
- /** Substrate-assigned split — train if captured before the campaign's
267
- * `temporalCutoff`, test if after. Explicit override allowed via filter. */
268
- split: 'train' | 'test';
269
- }
270
- interface LabeledScenarioSampleArgs {
271
- count: number;
272
- /** REQUIRED — substrate refuses to sample without an explicit split. */
273
- split: 'train' | 'test';
274
- /** REQUIRED — only records captured before this timestamp are returned.
275
- * Enforces temporal split discipline (test scenarios captured AFTER train
276
- * cannot enter the training pool). */
277
- capturedBefore: string;
278
- filter?: {
279
- kind?: string;
280
- source?: LabeledScenarioSource | LabeledScenarioSource[];
281
- minComposite?: number;
282
- maxComposite?: number;
283
- };
284
- }
285
- interface LabeledScenarioStore {
286
- observe(write: LabeledScenarioWrite): Promise<void>;
287
- sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]>;
288
- size(): Promise<{
289
- train: number;
290
- test: number;
291
- bySource: Record<string, number>;
292
- }>;
293
- }
294
- interface CampaignCellResult<TArtifact> {
295
- cellId: string;
296
- scenarioId: string;
297
- rep: number;
298
- generation?: number;
299
- artifact: TArtifact;
300
- judgeScores: Record<string, JudgeScore>;
301
- costUsd: number;
302
- durationMs: number;
303
- seed: number;
304
- cached: boolean;
305
- error?: string;
306
- }
307
- interface JudgeAggregate {
308
- mean: number;
309
- stdev: number;
310
- ci95: [number, number];
311
- n: number;
312
- }
313
- interface ScenarioAggregate {
314
- meanComposite: number;
315
- ci95: [number, number];
316
- n: number;
317
- }
318
- interface GenerationRecord {
319
- generationIndex: number;
320
- candidates: GenerationCandidate[];
321
- promoted: string[];
322
- }
323
- /** One scored candidate surface in a generation. `dimensions` + `scenarios`
324
- * let a reflective `ImprovementDriver` ground its next proposal on WHICH
325
- * dimensions the candidate is weakest on and WHICH scenarios it best/worst
326
- * handled — the evidence a blind `Mutator` cannot see. */
327
- interface GenerationCandidate {
328
- surfaceHash: string;
329
- composite: number;
330
- ci95: [number, number];
331
- /** Mean score per judge dimension across all cells (scenarios × reps ×
332
- * judges that reported the dimension). */
333
- dimensions: Record<string, number>;
334
- /** Per-scenario composite (mean over reps + judges). */
335
- scenarios: Array<{
336
- scenarioId: string;
337
- composite: number;
338
- }>;
339
- }
340
- interface CampaignAggregates {
341
- byJudge: Record<string, JudgeAggregate>;
342
- byScenario: Record<string, ScenarioAggregate>;
343
- totalCostUsd: number;
344
- cellsExecuted: number;
345
- cellsSkipped: number;
346
- cellsCached: number;
347
- cellsFailed: number;
348
- }
349
- interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scenario> {
350
- /** sha256(scenarios, judges, dispatch source ref, optimizer config, seed). Stable identity for reruns. */
351
- manifestHash: string;
352
- seed: number;
353
- startedAt: string;
354
- endedAt: string;
355
- durationMs: number;
356
- cells: Array<CampaignCellResult<TArtifact>>;
357
- aggregates: CampaignAggregates;
358
- optimization?: {
359
- generations: GenerationRecord[];
360
- winnerSurfaceHash?: string;
361
- };
362
- gate?: GateResult;
363
- prUrl?: string;
364
- runDir: string;
365
- artifactsByPath: Record<string, string>;
366
- /** Substrate strips the input scenarios to id+kind for the result manifest;
367
- * consumers needing full payload look it up via the original input. The
368
- * type parameter `TScenario` is propagated for downstream consumers that
369
- * want narrowed types when extending `CampaignResult`. */
370
- scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
371
- }
372
-
373
11
  /**
374
12
  * @experimental
375
13
  *
@@ -441,6 +79,48 @@ interface EvolutionaryDriverOptions<TFindings = unknown> {
441
79
  }
442
80
  declare function evolutionaryDriver<TFindings = unknown>(opts: EvolutionaryDriverOptions<TFindings>): ImprovementDriver<TFindings>;
443
81
 
82
+ /**
83
+ * @experimental
84
+ *
85
+ * `gepaDriver` — a reflective `ImprovementDriver` for prompt-tier surfaces.
86
+ * Each generation it reflects on the prior best candidate's per-scenario
87
+ * scores + weakest dimensions (the `GenerationCandidate` evidence from
88
+ * `runOptimization`), asks an LLM to propose targeted rewrites of the current
89
+ * surface, and returns them as the next population.
90
+ *
91
+ * This is the substrate's best-in-class prompt optimizer: surface-agnostic, so
92
+ * ANY string surface in ANY consumer opts in by selecting it — system prompts,
93
+ * prompt addenda, judge/reviewer prompts, even a driver's own reflection
94
+ * prompt. It reuses the generic reflection primitive (`buildReflectionPrompt` /
95
+ * `parseReflectionResponse`) and the router client; it has NO dependency on the
96
+ * legacy `runMultiShotOptimization` / `prompt-evolution` orchestration.
97
+ *
98
+ * It earns its keep where there is real per-instance signal (which the
99
+ * dimensional + per-scenario evidence + the `LabeledScenarioStore` flywheel
100
+ * now provide). For thin-signal surfaces it degrades to plain reflection — so
101
+ * it is a SELECTABLE driver, never a forced default. On generation 0 (no
102
+ * history) it reflects on the current surface against the mutation primitives
103
+ * alone.
104
+ */
105
+
106
+ interface GepaDriverOptions {
107
+ /** Router transport (apiKey/baseUrl). */
108
+ llm: LlmClientOptions;
109
+ /** Model that performs the reflection. */
110
+ model: string;
111
+ /** What is being optimized — appears in the reflection prompt for orientation. */
112
+ target: string;
113
+ /** Surface-specific mutation levers offered to the model. */
114
+ mutationPrimitives?: string[];
115
+ /** Top/bottom scenarios surfaced as evidence each generation. Default 3. */
116
+ evidenceK?: number;
117
+ /** Reflection sampling temperature. Default 0.7. */
118
+ temperature?: number;
119
+ /** Reflection max tokens. Default 6000. */
120
+ maxTokens?: number;
121
+ }
122
+ declare function gepaDriver(opts: GepaDriverOptions): ImprovementDriver;
123
+
444
124
  /**
445
125
  * @experimental
446
126
  *
@@ -796,4 +476,4 @@ declare function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAd
796
476
  * as a ref under the adapter's worktree dir. */
797
477
  declare function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string;
798
478
 
799
- export { type CampaignAggregates, type CampaignArtifactWriter, type CampaignCellResult, type CampaignCostMeter, type CampaignResult, type CampaignTraceWriter, type CodeSurface, type DefaultProductionGateOptions, type DispatchContext, type DispatchFn, type EvolutionaryDriverOptions, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type Gate, type GateContext, type GateDecision, type GateResult, type GenerationCandidate, type GenerationRecord, type GitWorktreeAdapterOptions, type HeldOutGateOptions, type ImprovementDriver, type JudgeAggregate, type JudgeConfig, type JudgeDimension, type JudgeScore, type LabeledScenarioRecord, type LabeledScenarioSampleArgs, type LabeledScenarioSource, type LabeledScenarioStore, LabeledScenarioStoreError, type LabeledScenarioWrite, type MutableSurface, type Mutator, type OpenAutoPrOptions, type OpenAutoPrResult, type OptimizerConfig, type ProposeContext, type RedactionStatus, type RunCampaignOptions, type RunEvalOptions, type RunImprovementLoopOptions, type RunImprovementLoopResult, type RunOptimizationOptions, type RunOptimizationResult, type Scenario, type ScenarioAggregate, type SessionScript, type TraceSpan, type Worktree, type WorktreeAdapter, WorktreeAdapterError, composeGate, defaultProductionGate, evolutionaryDriver, gitWorktreeAdapter, heldOutGate, openAutoPr, resolveWorktreePath, runCampaign, runEval, runImprovementLoop, runOptimization, surfaceHash };
479
+ export { CampaignResult, CampaignTraceWriter, CodeSurface, type DefaultProductionGateOptions, DispatchFn, type EvolutionaryDriverOptions, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, Gate, GateResult, GenerationRecord, type GepaDriverOptions, type GitWorktreeAdapterOptions, type HeldOutGateOptions, ImprovementDriver, JudgeConfig, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, MutableSurface, Mutator, type OpenAutoPrOptions, type OpenAutoPrResult, type RunCampaignOptions, type RunEvalOptions, type RunImprovementLoopOptions, type RunImprovementLoopResult, type RunOptimizationOptions, type RunOptimizationResult, Scenario, type Worktree, type WorktreeAdapter, WorktreeAdapterError, composeGate, defaultProductionGate, evolutionaryDriver, gepaDriver, gitWorktreeAdapter, heldOutGate, openAutoPr, resolveWorktreePath, runCampaign, runEval, runImprovementLoop, runOptimization, surfaceHash };
@@ -2,14 +2,20 @@ import {
2
2
  runCampaign
3
3
  } from "../chunk-YNMCYUWT.js";
4
4
  import {
5
+ buildReflectionPrompt,
6
+ parseReflectionResponse,
5
7
  runCanaries,
6
8
  scoreRedTeamOutput
7
- } from "../chunk-5U2DOJU4.js";
9
+ } from "../chunk-N4SBKEPJ.js";
8
10
  import {
9
11
  detectRewardHacking
10
12
  } from "../chunk-YV7J7X5N.js";
11
13
  import "../chunk-WP7SY7AI.js";
12
14
  import "../chunk-GGE4NNQT.js";
15
+ import {
16
+ callLlm
17
+ } from "../chunk-VXNVVBZO.js";
18
+ import "../chunk-PC4UYEBM.js";
13
19
  import "../chunk-QYJT52YW.js";
14
20
  import "../chunk-PZ5AY32C.js";
15
21
 
@@ -141,6 +147,65 @@ function evolutionaryDriver(opts) {
141
147
  };
142
148
  }
143
149
 
150
+ // src/campaign/drivers/gepa.ts
151
+ var REFLECTION_SYSTEM = 'You are an expert prompt engineer. Output ONLY a JSON object of shape {"proposals":[{"label":string,"rationale":string,"payload":string}]} where each `payload` is the FULL improved surface text. No prose outside the JSON.';
152
+ function gepaDriver(opts) {
153
+ const evidenceK = opts.evidenceK ?? 3;
154
+ return {
155
+ kind: "gepa",
156
+ async propose(ctx) {
157
+ const parent = typeof ctx.currentSurface === "string" ? ctx.currentSurface : JSON.stringify(ctx.currentSurface);
158
+ const { top, bottom, target } = buildEvidence(ctx, evidenceK, opts.target);
159
+ const userPrompt = buildReflectionPrompt({
160
+ target,
161
+ parentPayload: parent,
162
+ topTrials: top,
163
+ bottomTrials: bottom,
164
+ childCount: ctx.populationSize,
165
+ mutationPrimitives: opts.mutationPrimitives
166
+ });
167
+ const result = await callLlm(
168
+ {
169
+ model: opts.model,
170
+ messages: [
171
+ { role: "system", content: REFLECTION_SYSTEM },
172
+ { role: "user", content: userPrompt }
173
+ ],
174
+ jsonMode: true,
175
+ temperature: opts.temperature ?? 0.7,
176
+ maxTokens: opts.maxTokens ?? 6e3
177
+ },
178
+ opts.llm
179
+ );
180
+ const proposals = parseReflectionResponse(result.content, ctx.populationSize);
181
+ const out = [];
182
+ for (const proposal of proposals) {
183
+ const text = typeof proposal.payload === "string" ? proposal.payload.trim() : "";
184
+ if (text && text !== parent && !out.includes(text)) out.push(text);
185
+ }
186
+ return out;
187
+ }
188
+ };
189
+ }
190
+ function buildEvidence(ctx, evidenceK, baseTarget) {
191
+ const last = ctx.history.at(-1);
192
+ if (!last || last.candidates.length === 0) {
193
+ return { top: [], bottom: [], target: baseTarget };
194
+ }
195
+ const best = [...last.candidates].sort((a, b) => b.composite - a.composite)[0];
196
+ if (!best) return { top: [], bottom: [], target: baseTarget };
197
+ const byScore = [...best.scenarios].sort((a, b) => b.composite - a.composite);
198
+ const toTrace = (s) => ({
199
+ id: s.scenarioId,
200
+ score: s.composite
201
+ });
202
+ const top = byScore.slice(0, evidenceK).map(toTrace);
203
+ const bottom = byScore.slice(-evidenceK).reverse().map(toTrace);
204
+ const weakest = Object.entries(best.dimensions).sort((a, b) => a[1] - b[1]).slice(0, 3).map(([dim, value]) => `${dim} (${value.toFixed(2)})`);
205
+ const target = weakest.length > 0 ? `${baseTarget} \u2014 weakest dimensions: ${weakest.join(", ")}` : baseTarget;
206
+ return { top, bottom, target };
207
+ }
208
+
144
209
  // src/campaign/gates/compose.ts
145
210
  function composeGate(...gates) {
146
211
  if (gates.length === 0) {
@@ -828,6 +893,7 @@ export {
828
893
  composeGate,
829
894
  defaultProductionGate,
830
895
  evolutionaryDriver,
896
+ gepaDriver,
831
897
  gitWorktreeAdapter,
832
898
  heldOutGate,
833
899
  openAutoPr,