@tangle-network/agent-eval 0.40.5 → 0.42.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +48 -355
- package/dist/campaign/index.js +106 -6
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
- package/dist/chunk-H4TOS272.js.map +1 -0
- package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
- package/dist/chunk-KQ26DYTQ.js.map +1 -0
- package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
- package/dist/chunk-MNL6LXGQ.js.map +1 -0
- package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
- package/dist/chunk-N4SBKEPJ.js.map +1 -0
- package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
- package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
- package/dist/index.d.ts +227 -687
- package/dist/index.js +753 -1237
- package/dist/index.js.map +1 -1
- package/dist/integrity-CTDhR1Sg.d.ts +81 -0
- package/dist/llm-client-BXVRUZyX.d.ts +234 -0
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +67 -3
- package/dist/pipelines/index.js.map +1 -1
- package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
- package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
- package/dist/reporting.d.ts +2 -3
- package/dist/reporting.js +4 -8
- package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
- package/dist/rl.d.ts +103 -221
- package/dist/rl.js +44 -199
- package/dist/rl.js.map +1 -1
- package/dist/sequential-DdV5ShjT.d.ts +561 -0
- package/dist/traces.d.ts +3 -2
- package/dist/traces.js +5 -5
- package/dist/types-BLbRTxoc.d.ts +367 -0
- package/dist/wire/index.d.ts +1 -1
- package/package.json +1 -6
- package/dist/chunk-5U2DOJU4.js.map +0 -1
- package/dist/chunk-AU2JLNSZ.js.map +0 -1
- package/dist/chunk-DMW5VENN.js +0 -1412
- package/dist/chunk-DMW5VENN.js.map +0 -1
- package/dist/chunk-EGIPWXHL.js.map +0 -1
- package/dist/chunk-MAZ26DC7.js +0 -99
- package/dist/chunk-MAZ26DC7.js.map +0 -1
- package/dist/chunk-NKLGKF2Q.js.map +0 -1
- package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
- package/dist/optimization.d.ts +0 -11
- package/dist/optimization.js +0 -71
- package/dist/optimization.js.map +0 -1
- package/dist/sequential-5iSVfzl2.d.ts +0 -139
- package/dist/summary-report-DuZXOk7K.d.ts +0 -917
- /package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0
package/dist/campaign/index.d.ts
CHANGED
|
@@ -1,362 +1,13 @@
|
|
|
1
|
+
import { S as Scenario, C as CampaignResult, G as GateResult, M as Mutator, I as ImprovementDriver, a as Gate, L as LabeledScenarioStore, b as LabeledScenarioWrite, c as LabeledScenarioSampleArgs, d as LabeledScenarioRecord, D as DispatchFn, J as JudgeConfig, e as CampaignTraceWriter, f as MutableSurface, g as GenerationRecord, h as CodeSurface } from '../types-BLbRTxoc.js';
|
|
2
|
+
export { i as CampaignAggregates, j as CampaignArtifactWriter, k as CampaignCellResult, l as CampaignCostMeter, m as DispatchContext, n as GateContext, o as GateDecision, p as GenerationCandidate, q as JudgeAggregate, r as JudgeDimension, s as JudgeScore, t as LabeledScenarioSource, O as OptimizerConfig, P as ProposeContext, R as RedactionStatus, u as ScenarioAggregate, v as SessionScript, T as TraceSpan } from '../types-BLbRTxoc.js';
|
|
3
|
+
import { L as LlmClientOptions } from '../llm-client-BXVRUZyX.js';
|
|
1
4
|
import { RunRecord } from '@tangle-network/agent-runtime';
|
|
2
5
|
import { R as RedTeamCase } from '../red-team-30II1T4o.js';
|
|
3
|
-
import '../dataset-BlwAtYYf.js';
|
|
4
6
|
import '../errors-mje_cKOs.js';
|
|
7
|
+
import '../raw-provider-sink-C46HDghv.js';
|
|
8
|
+
import '../dataset-BlwAtYYf.js';
|
|
5
9
|
import '../store-Db2Bv8Cf.js';
|
|
6
10
|
|
|
7
|
-
/**
|
|
8
|
-
* @experimental
|
|
9
|
-
*
|
|
10
|
-
* Pass A substrate types — `runCampaign` is the one primitive every
|
|
11
|
-
* eval flow composes from. Three contracts in this file:
|
|
12
|
-
*
|
|
13
|
-
* - `Scenario` input set
|
|
14
|
-
* - `DispatchFn` how to run one scenario → artifact
|
|
15
|
-
* - `CampaignResult` defined output schema (the contract downstream tools depend on)
|
|
16
|
-
*
|
|
17
|
-
* Three more lifted from earlier substrate work (re-exported):
|
|
18
|
-
*
|
|
19
|
-
* - `JudgeConfig` pluggable dimensional scorer (0.38)
|
|
20
|
-
* - `Mutator` optimization-loop surface mutator
|
|
21
|
-
* - `Gate` promotion gate (`HeldOutGate` and friends adapt to this)
|
|
22
|
-
*
|
|
23
|
-
* No new architecture vs 0.38 — Pass A formalizes the shapes so consumers
|
|
24
|
-
* can build dashboards / CI gates / regression diffs against a stable schema.
|
|
25
|
-
*/
|
|
26
|
-
/** @experimental Stable identifier + kind tag for any scenario. Consumers
|
|
27
|
-
* extend with their per-domain payload (persona, task, requirement, ...). */
|
|
28
|
-
interface Scenario {
|
|
29
|
-
id: string;
|
|
30
|
-
kind: string;
|
|
31
|
-
tags?: string[];
|
|
32
|
-
}
|
|
33
|
-
/** @experimental Context handed to every dispatch invocation. Scoped — every
|
|
34
|
-
* trace/span carries the cellId, every artifact write lands under the cell's
|
|
35
|
-
* artifact root, the cost meter accumulates per cell. */
|
|
36
|
-
interface DispatchContext {
|
|
37
|
-
cellId: string;
|
|
38
|
-
rep: number;
|
|
39
|
-
generation?: number;
|
|
40
|
-
seed: number;
|
|
41
|
-
signal: AbortSignal;
|
|
42
|
-
trace: CampaignTraceWriter;
|
|
43
|
-
artifacts: CampaignArtifactWriter;
|
|
44
|
-
cost: CampaignCostMeter;
|
|
45
|
-
/** Populated when this run is part of a multi-cycle improvement loop. */
|
|
46
|
-
cycleId?: string;
|
|
47
|
-
/** Populated when the substrate resumed from a prior cache hit. */
|
|
48
|
-
resumedFrom?: string;
|
|
49
|
-
}
|
|
50
|
-
/** @experimental One function: scenario + ctx → artifact. Dispatcher chooses
|
|
51
|
-
* whether to call `runMultishot`, `runLoop`, raw `streamPrompt`, anything. */
|
|
52
|
-
type DispatchFn<TScenario extends Scenario, TArtifact> = (scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>;
|
|
53
|
-
/** @experimental One session within a multi-session journey. Dispatch is
|
|
54
|
-
* invoked once per session in order; state from prior session's artifact
|
|
55
|
-
* is exposed via `ctx.priorSessionArtifact`. */
|
|
56
|
-
interface SessionScript<TScenario, TArtifact> {
|
|
57
|
-
id: string;
|
|
58
|
-
intent: string;
|
|
59
|
-
maxTurns?: number;
|
|
60
|
-
/** When true, knowledge accumulated this session persists to next. */
|
|
61
|
-
affectsKnowledge?: boolean;
|
|
62
|
-
/** Optional per-session persona evolution — called after the session
|
|
63
|
-
* resolves. Returns the persona shape used by the NEXT session. */
|
|
64
|
-
evolveAfterSession?: (artifact: TArtifact, sessionIndex: number, scenario: TScenario) => TScenario;
|
|
65
|
-
}
|
|
66
|
-
interface JudgeDimension {
|
|
67
|
-
/** JSON field name + score key. */
|
|
68
|
-
key: string;
|
|
69
|
-
/** Description shown in the judge's user prompt. */
|
|
70
|
-
description: string;
|
|
71
|
-
}
|
|
72
|
-
/** @experimental Pluggable dimensional scorer. `score` is the contract:
|
|
73
|
-
* given an artifact + scenario, return a `JudgeScore`. This is deliberately a
|
|
74
|
-
* function, not a fixed LLM-prompt shape — real consumers judge with
|
|
75
|
-
* ensembles, deterministic checks, or a single LLM call, and the substrate
|
|
76
|
-
* must not constrain that. The `llmJudge()` helper builds a `score` that does
|
|
77
|
-
* one LLM call for the common case. `appliesTo` lets a judge run only on
|
|
78
|
-
* scenarios that match (e.g. a legal-citation judge only on legal scenarios). */
|
|
79
|
-
interface JudgeConfig<TArtifact, TScenario extends Scenario = Scenario> {
|
|
80
|
-
name: string;
|
|
81
|
-
dimensions: JudgeDimension[];
|
|
82
|
-
/** Score one artifact. Throw on failure — a thrown judge is recorded as a
|
|
83
|
-
* failed cell, never silently folded into a zero. */
|
|
84
|
-
score(input: {
|
|
85
|
-
artifact: TArtifact;
|
|
86
|
-
scenario: TScenario;
|
|
87
|
-
signal: AbortSignal;
|
|
88
|
-
}): JudgeScore | Promise<JudgeScore>;
|
|
89
|
-
appliesTo?: (scenario: TScenario) => boolean;
|
|
90
|
-
}
|
|
91
|
-
interface JudgeScore {
|
|
92
|
-
dimensions: Record<string, number>;
|
|
93
|
-
composite: number;
|
|
94
|
-
notes: string;
|
|
95
|
-
}
|
|
96
|
-
/** @experimental A tier-4 code surface — a candidate change to the agent's
|
|
97
|
-
* IMPLEMENTATION, not its prompt. Produced by autoresearch (reads codebase +
|
|
98
|
-
* trace findings → opens a worktree). Measured by checking out `worktreeRef`
|
|
99
|
-
* and running the worker against the changed code. See the improvement-tier
|
|
100
|
-
* table in `docs/design/loop-taxonomy.md`. */
|
|
101
|
-
interface CodeSurface {
|
|
102
|
-
kind: 'code';
|
|
103
|
-
/** Worktree path or git ref holding the candidate code change. The
|
|
104
|
-
* consumer's `dispatchWithSurface` checks this out before running. */
|
|
105
|
-
worktreeRef: string;
|
|
106
|
-
/** Base ref the change is measured against. Default: the repo's main. */
|
|
107
|
-
baseRef?: string;
|
|
108
|
-
/** Human summary of what changed — rendered into the auto-PR body. */
|
|
109
|
-
summary?: string;
|
|
110
|
-
}
|
|
111
|
-
/** @experimental The mutable surface a driver proposes. Tiers (see
|
|
112
|
-
* `docs/design/loop-taxonomy.md`):
|
|
113
|
-
* - `string` — tiers 1-2: system-prompt addendum / serialized tool
|
|
114
|
-
* config. Cheap, reversible, text-diffable.
|
|
115
|
-
* - `CodeSurface` — tier 4: an implementation change behind a worktree ref.
|
|
116
|
-
* Tier 3 (knowledge) is owned by agent-knowledge and rides its own adapter,
|
|
117
|
-
* not this type. */
|
|
118
|
-
type MutableSurface = string | CodeSurface;
|
|
119
|
-
/** @experimental Stateless surface mutation — given findings + current
|
|
120
|
-
* surface, return N candidate surfaces. Pure transform, no generation
|
|
121
|
-
* awareness. Reflective-mutation, `runMultiShotOptimization`, `AxGEPA`
|
|
122
|
-
* conform. Wrapped by `evolutionaryDriver` to become an `ImprovementDriver`. */
|
|
123
|
-
interface Mutator<TFindings = unknown> {
|
|
124
|
-
kind: string;
|
|
125
|
-
mutate(args: {
|
|
126
|
-
findings: TFindings[];
|
|
127
|
-
currentSurface: MutableSurface;
|
|
128
|
-
populationSize: number;
|
|
129
|
-
signal: AbortSignal;
|
|
130
|
-
}): Promise<MutableSurface[]>;
|
|
131
|
-
}
|
|
132
|
-
/** @experimental Everything a driver's `propose()` may read to plan the next
|
|
133
|
-
* batch of candidates. The first six fields are always present; the rest are
|
|
134
|
-
* optional context the loop supplies when available, so cheap drivers
|
|
135
|
-
* (`evolutionaryDriver`) can ignore them while a code-tier agentic generator
|
|
136
|
-
* consumes the research report + dataset to drive a coding harness.
|
|
137
|
-
* See `docs/design/self-improvement-engine.md`. */
|
|
138
|
-
interface ProposeContext<TFindings = unknown> {
|
|
139
|
-
currentSurface: MutableSurface;
|
|
140
|
-
history: GenerationRecord[];
|
|
141
|
-
findings: TFindings[];
|
|
142
|
-
/** BREADTH: how many candidate surfaces to return this generation. */
|
|
143
|
-
populationSize: number;
|
|
144
|
-
generation: number;
|
|
145
|
-
signal: AbortSignal;
|
|
146
|
-
/** The Phase-2 research report (analyst findings + diff), produced AFTER the
|
|
147
|
-
* trace analysts run. Opaque to the substrate — the driver that consumes it
|
|
148
|
-
* types it. See the phase diagram in self-improvement-engine.md. */
|
|
149
|
-
report?: unknown;
|
|
150
|
-
/** Handle to all captured data — the driver samples traces / artifacts /
|
|
151
|
-
* rewards here to ground its proposals. */
|
|
152
|
-
dataset?: LabeledScenarioStore;
|
|
153
|
-
/** DEPTH: max iterations the agentic generator may take per candidate.
|
|
154
|
-
* 1 = single-shot; >1 = it may iterate on its own change before handing it
|
|
155
|
-
* back to be measured. */
|
|
156
|
-
maxImprovementShots?: number;
|
|
157
|
-
}
|
|
158
|
-
/** @experimental A surface-improvement strategy — the DRIVER of the
|
|
159
|
-
* improvement loop. Given the current best surface, the history of what's
|
|
160
|
-
* been tried + scored, and any external findings, propose the next batch of
|
|
161
|
-
* candidate surfaces to measure. Optionally decide to stop early.
|
|
162
|
-
*
|
|
163
|
-
* The evolutionary mutator (`evolutionaryDriver`, here) and agent-runtime's
|
|
164
|
-
* `improvementDriver` (with reflective / agentic generators) both conform —
|
|
165
|
-
* drivers of the SAME loop, not separate loops. The loop body
|
|
166
|
-
* (`runOptimization`) and the gated promotion shell (`runImprovementLoop`)
|
|
167
|
-
* are driver-agnostic. */
|
|
168
|
-
interface ImprovementDriver<TFindings = unknown> {
|
|
169
|
-
kind: string;
|
|
170
|
-
/** Plan: propose N candidate surfaces for the next generation. */
|
|
171
|
-
propose(ctx: ProposeContext<TFindings>): Promise<MutableSurface[]>;
|
|
172
|
-
/** Decide: stop early when the driver judges the search converged or
|
|
173
|
-
* exhausted. Default (omitted) runs all `maxGenerations`. */
|
|
174
|
-
decide?(args: {
|
|
175
|
-
history: GenerationRecord[];
|
|
176
|
-
}): {
|
|
177
|
-
stop: boolean;
|
|
178
|
-
reason?: string;
|
|
179
|
-
};
|
|
180
|
-
}
|
|
181
|
-
interface OptimizerConfig {
|
|
182
|
-
driver: ImprovementDriver;
|
|
183
|
-
populationSize: number;
|
|
184
|
-
maxGenerations: number;
|
|
185
|
-
surfaceExtractor: (profile: unknown) => MutableSurface;
|
|
186
|
-
}
|
|
187
|
-
/** @experimental Five-valued verdict taxonomy (MOSS-paper alignment). */
|
|
188
|
-
type GateDecision = 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling';
|
|
189
|
-
interface GateContext<TArtifact, TScenario extends Scenario> {
|
|
190
|
-
candidateArtifacts: Map<string, TArtifact>;
|
|
191
|
-
baselineArtifacts?: Map<string, TArtifact>;
|
|
192
|
-
/** Candidate (winner) judge scores, keyed by cellId. */
|
|
193
|
-
judgeScores: Map<string, Record<string, JudgeScore>>;
|
|
194
|
-
/** Baseline judge scores, keyed by cellId. SEPARATE from `judgeScores` —
|
|
195
|
-
* baseline + candidate share cellIds (same scenarios), so a single map
|
|
196
|
-
* cannot represent both. A gate computing a holdout delta MUST read
|
|
197
|
-
* candidate from `judgeScores` and baseline from here. */
|
|
198
|
-
baselineJudgeScores?: Map<string, Record<string, JudgeScore>>;
|
|
199
|
-
scenarios: TScenario[];
|
|
200
|
-
cost: {
|
|
201
|
-
candidate: number;
|
|
202
|
-
baseline: number;
|
|
203
|
-
};
|
|
204
|
-
signal: AbortSignal;
|
|
205
|
-
}
|
|
206
|
-
interface GateResult {
|
|
207
|
-
decision: GateDecision;
|
|
208
|
-
reasons: string[];
|
|
209
|
-
contributingGates: Array<{
|
|
210
|
-
name: string;
|
|
211
|
-
passed: boolean;
|
|
212
|
-
detail: unknown;
|
|
213
|
-
}>;
|
|
214
|
-
delta?: number;
|
|
215
|
-
}
|
|
216
|
-
/** @experimental Composable promotion gate. */
|
|
217
|
-
interface Gate<TArtifact = unknown, TScenario extends Scenario = Scenario> {
|
|
218
|
-
name: string;
|
|
219
|
-
decide(ctx: GateContext<TArtifact, TScenario>): Promise<GateResult>;
|
|
220
|
-
}
|
|
221
|
-
/** @experimental Scoped trace writer handed to each dispatch — every span
|
|
222
|
-
* auto-tagged with the cellId so traces filter cleanly. */
|
|
223
|
-
interface CampaignTraceWriter {
|
|
224
|
-
span(name: string, attributes?: Record<string, unknown>): TraceSpan;
|
|
225
|
-
flush(): Promise<void>;
|
|
226
|
-
}
|
|
227
|
-
interface TraceSpan {
|
|
228
|
-
end(attributes?: Record<string, unknown>): void;
|
|
229
|
-
setAttribute(key: string, value: unknown): void;
|
|
230
|
-
}
|
|
231
|
-
/** @experimental Scoped artifact writer — `write(path, content)` lands under
|
|
232
|
-
* `<runDir>/<cellId>/<path>`. */
|
|
233
|
-
interface CampaignArtifactWriter {
|
|
234
|
-
write(path: string, content: string | Uint8Array): Promise<string>;
|
|
235
|
-
writeJson(path: string, value: unknown): Promise<string>;
|
|
236
|
-
}
|
|
237
|
-
/** @experimental Cell-scoped cost meter. Substrate auto-tracks LLM costs
|
|
238
|
-
* via the cost-ledger backend hooks; consumers can record additional
|
|
239
|
-
* spend (sandbox time, tool costs) via `observe`. */
|
|
240
|
-
interface CampaignCostMeter {
|
|
241
|
-
observe(amountUsd: number, source: string): void;
|
|
242
|
-
current(): number;
|
|
243
|
-
}
|
|
244
|
-
/** @experimental Source tag — required on every store write. Used by the
|
|
245
|
-
* default training-source filter (production-trace samples NOT used as
|
|
246
|
-
* training scenarios unless explicitly opted in). */
|
|
247
|
-
type LabeledScenarioSource = 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic';
|
|
248
|
-
type RedactionStatus = 'raw' | 'redacted-pii' | 'redacted-secrets' | 'fully-redacted';
|
|
249
|
-
/** @experimental Required-provenance write. The store rejects writes that
|
|
250
|
-
* lack provenance — a default-on flywheel without provenance is the
|
|
251
|
-
* data-poisoning vector flagged in the alignment review. */
|
|
252
|
-
interface LabeledScenarioWrite<TScenario extends Scenario = Scenario, TArtifact = unknown> {
|
|
253
|
-
scenario: TScenario;
|
|
254
|
-
artifact: TArtifact;
|
|
255
|
-
judgeScores: Record<string, JudgeScore>;
|
|
256
|
-
source: LabeledScenarioSource;
|
|
257
|
-
sourceVersionHash: string;
|
|
258
|
-
capturedAt: string;
|
|
259
|
-
redactionStatus: RedactionStatus;
|
|
260
|
-
/** Optional per-source rate-limit bucket key (e.g., the tenant id). */
|
|
261
|
-
rateLimitBucket?: string;
|
|
262
|
-
}
|
|
263
|
-
interface LabeledScenarioRecord<TScenario extends Scenario = Scenario, TArtifact = unknown> extends LabeledScenarioWrite<TScenario, TArtifact> {
|
|
264
|
-
/** Stable hash of (scenario.id, source, capturedAt, sourceVersionHash). */
|
|
265
|
-
recordHash: string;
|
|
266
|
-
/** Substrate-assigned split — train if captured before the campaign's
|
|
267
|
-
* `temporalCutoff`, test if after. Explicit override allowed via filter. */
|
|
268
|
-
split: 'train' | 'test';
|
|
269
|
-
}
|
|
270
|
-
interface LabeledScenarioSampleArgs {
|
|
271
|
-
count: number;
|
|
272
|
-
/** REQUIRED — substrate refuses to sample without an explicit split. */
|
|
273
|
-
split: 'train' | 'test';
|
|
274
|
-
/** REQUIRED — only records captured before this timestamp are returned.
|
|
275
|
-
* Enforces temporal split discipline (test scenarios captured AFTER train
|
|
276
|
-
* cannot enter the training pool). */
|
|
277
|
-
capturedBefore: string;
|
|
278
|
-
filter?: {
|
|
279
|
-
kind?: string;
|
|
280
|
-
source?: LabeledScenarioSource | LabeledScenarioSource[];
|
|
281
|
-
minComposite?: number;
|
|
282
|
-
maxComposite?: number;
|
|
283
|
-
};
|
|
284
|
-
}
|
|
285
|
-
interface LabeledScenarioStore {
|
|
286
|
-
observe(write: LabeledScenarioWrite): Promise<void>;
|
|
287
|
-
sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]>;
|
|
288
|
-
size(): Promise<{
|
|
289
|
-
train: number;
|
|
290
|
-
test: number;
|
|
291
|
-
bySource: Record<string, number>;
|
|
292
|
-
}>;
|
|
293
|
-
}
|
|
294
|
-
interface CampaignCellResult<TArtifact> {
|
|
295
|
-
cellId: string;
|
|
296
|
-
scenarioId: string;
|
|
297
|
-
rep: number;
|
|
298
|
-
generation?: number;
|
|
299
|
-
artifact: TArtifact;
|
|
300
|
-
judgeScores: Record<string, JudgeScore>;
|
|
301
|
-
costUsd: number;
|
|
302
|
-
durationMs: number;
|
|
303
|
-
seed: number;
|
|
304
|
-
cached: boolean;
|
|
305
|
-
error?: string;
|
|
306
|
-
}
|
|
307
|
-
interface JudgeAggregate {
|
|
308
|
-
mean: number;
|
|
309
|
-
stdev: number;
|
|
310
|
-
ci95: [number, number];
|
|
311
|
-
n: number;
|
|
312
|
-
}
|
|
313
|
-
interface ScenarioAggregate {
|
|
314
|
-
meanComposite: number;
|
|
315
|
-
ci95: [number, number];
|
|
316
|
-
n: number;
|
|
317
|
-
}
|
|
318
|
-
interface GenerationRecord {
|
|
319
|
-
generationIndex: number;
|
|
320
|
-
candidates: Array<{
|
|
321
|
-
surfaceHash: string;
|
|
322
|
-
composite: number;
|
|
323
|
-
ci95: [number, number];
|
|
324
|
-
}>;
|
|
325
|
-
promoted: string[];
|
|
326
|
-
}
|
|
327
|
-
interface CampaignAggregates {
|
|
328
|
-
byJudge: Record<string, JudgeAggregate>;
|
|
329
|
-
byScenario: Record<string, ScenarioAggregate>;
|
|
330
|
-
totalCostUsd: number;
|
|
331
|
-
cellsExecuted: number;
|
|
332
|
-
cellsSkipped: number;
|
|
333
|
-
cellsCached: number;
|
|
334
|
-
cellsFailed: number;
|
|
335
|
-
}
|
|
336
|
-
interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scenario> {
|
|
337
|
-
/** sha256(scenarios, judges, dispatch source ref, optimizer config, seed). Stable identity for reruns. */
|
|
338
|
-
manifestHash: string;
|
|
339
|
-
seed: number;
|
|
340
|
-
startedAt: string;
|
|
341
|
-
endedAt: string;
|
|
342
|
-
durationMs: number;
|
|
343
|
-
cells: Array<CampaignCellResult<TArtifact>>;
|
|
344
|
-
aggregates: CampaignAggregates;
|
|
345
|
-
optimization?: {
|
|
346
|
-
generations: GenerationRecord[];
|
|
347
|
-
winnerSurfaceHash?: string;
|
|
348
|
-
};
|
|
349
|
-
gate?: GateResult;
|
|
350
|
-
prUrl?: string;
|
|
351
|
-
runDir: string;
|
|
352
|
-
artifactsByPath: Record<string, string>;
|
|
353
|
-
/** Substrate strips the input scenarios to id+kind for the result manifest;
|
|
354
|
-
* consumers needing full payload look it up via the original input. The
|
|
355
|
-
* type parameter `TScenario` is propagated for downstream consumers that
|
|
356
|
-
* want narrowed types when extending `CampaignResult`. */
|
|
357
|
-
scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
|
|
358
|
-
}
|
|
359
|
-
|
|
360
11
|
/**
|
|
361
12
|
* @experimental
|
|
362
13
|
*
|
|
@@ -428,6 +79,48 @@ interface EvolutionaryDriverOptions<TFindings = unknown> {
|
|
|
428
79
|
}
|
|
429
80
|
declare function evolutionaryDriver<TFindings = unknown>(opts: EvolutionaryDriverOptions<TFindings>): ImprovementDriver<TFindings>;
|
|
430
81
|
|
|
82
|
+
/**
|
|
83
|
+
* @experimental
|
|
84
|
+
*
|
|
85
|
+
* `gepaDriver` — a reflective `ImprovementDriver` for prompt-tier surfaces.
|
|
86
|
+
* Each generation it reflects on the prior best candidate's per-scenario
|
|
87
|
+
* scores + weakest dimensions (the `GenerationCandidate` evidence from
|
|
88
|
+
* `runOptimization`), asks an LLM to propose targeted rewrites of the current
|
|
89
|
+
* surface, and returns them as the next population.
|
|
90
|
+
*
|
|
91
|
+
* This is the substrate's best-in-class prompt optimizer: surface-agnostic, so
|
|
92
|
+
* ANY string surface in ANY consumer opts in by selecting it — system prompts,
|
|
93
|
+
* prompt addenda, judge/reviewer prompts, even a driver's own reflection
|
|
94
|
+
* prompt. It reuses the generic reflection primitive (`buildReflectionPrompt` /
|
|
95
|
+
* `parseReflectionResponse`) and the router client; it has NO dependency on the
|
|
96
|
+
* legacy `runMultiShotOptimization` / `prompt-evolution` orchestration.
|
|
97
|
+
*
|
|
98
|
+
* It earns its keep where there is real per-instance signal (which the
|
|
99
|
+
* dimensional + per-scenario evidence + the `LabeledScenarioStore` flywheel
|
|
100
|
+
* now provide). For thin-signal surfaces it degrades to plain reflection — so
|
|
101
|
+
* it is a SELECTABLE driver, never a forced default. On generation 0 (no
|
|
102
|
+
* history) it reflects on the current surface against the mutation primitives
|
|
103
|
+
* alone.
|
|
104
|
+
*/
|
|
105
|
+
|
|
106
|
+
interface GepaDriverOptions {
|
|
107
|
+
/** Router transport (apiKey/baseUrl). */
|
|
108
|
+
llm: LlmClientOptions;
|
|
109
|
+
/** Model that performs the reflection. */
|
|
110
|
+
model: string;
|
|
111
|
+
/** What is being optimized — appears in the reflection prompt for orientation. */
|
|
112
|
+
target: string;
|
|
113
|
+
/** Surface-specific mutation levers offered to the model. */
|
|
114
|
+
mutationPrimitives?: string[];
|
|
115
|
+
/** Top/bottom scenarios surfaced as evidence each generation. Default 3. */
|
|
116
|
+
evidenceK?: number;
|
|
117
|
+
/** Reflection sampling temperature. Default 0.7. */
|
|
118
|
+
temperature?: number;
|
|
119
|
+
/** Reflection max tokens. Default 6000. */
|
|
120
|
+
maxTokens?: number;
|
|
121
|
+
}
|
|
122
|
+
declare function gepaDriver(opts: GepaDriverOptions): ImprovementDriver;
|
|
123
|
+
|
|
431
124
|
/**
|
|
432
125
|
* @experimental
|
|
433
126
|
*
|
|
@@ -783,4 +476,4 @@ declare function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAd
|
|
|
783
476
|
* as a ref under the adapter's worktree dir. */
|
|
784
477
|
declare function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string;
|
|
785
478
|
|
|
786
|
-
export {
|
|
479
|
+
export { CampaignResult, CampaignTraceWriter, CodeSurface, type DefaultProductionGateOptions, DispatchFn, type EvolutionaryDriverOptions, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, Gate, GateResult, GenerationRecord, type GepaDriverOptions, type GitWorktreeAdapterOptions, type HeldOutGateOptions, ImprovementDriver, JudgeConfig, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, MutableSurface, Mutator, type OpenAutoPrOptions, type OpenAutoPrResult, type RunCampaignOptions, type RunEvalOptions, type RunImprovementLoopOptions, type RunImprovementLoopResult, type RunOptimizationOptions, type RunOptimizationResult, Scenario, type Worktree, type WorktreeAdapter, WorktreeAdapterError, composeGate, defaultProductionGate, evolutionaryDriver, gepaDriver, gitWorktreeAdapter, heldOutGate, openAutoPr, resolveWorktreePath, runCampaign, runEval, runImprovementLoop, runOptimization, surfaceHash };
|
package/dist/campaign/index.js
CHANGED
|
@@ -2,14 +2,20 @@ import {
|
|
|
2
2
|
runCampaign
|
|
3
3
|
} from "../chunk-YNMCYUWT.js";
|
|
4
4
|
import {
|
|
5
|
+
buildReflectionPrompt,
|
|
6
|
+
parseReflectionResponse,
|
|
5
7
|
runCanaries,
|
|
6
8
|
scoreRedTeamOutput
|
|
7
|
-
} from "../chunk-5U2DOJU4.js";
|
|
9
|
+
} from "../chunk-N4SBKEPJ.js";
|
|
8
10
|
import {
|
|
9
11
|
detectRewardHacking
|
|
10
12
|
} from "../chunk-YV7J7X5N.js";
|
|
11
13
|
import "../chunk-WP7SY7AI.js";
|
|
12
14
|
import "../chunk-GGE4NNQT.js";
|
|
15
|
+
import {
|
|
16
|
+
callLlm
|
|
17
|
+
} from "../chunk-VXNVVBZO.js";
|
|
18
|
+
import "../chunk-PC4UYEBM.js";
|
|
13
19
|
import "../chunk-QYJT52YW.js";
|
|
14
20
|
import "../chunk-PZ5AY32C.js";
|
|
15
21
|
|
|
@@ -141,6 +147,65 @@ function evolutionaryDriver(opts) {
|
|
|
141
147
|
};
|
|
142
148
|
}
|
|
143
149
|
|
|
150
|
+
// src/campaign/drivers/gepa.ts
|
|
151
|
+
var REFLECTION_SYSTEM = 'You are an expert prompt engineer. Output ONLY a JSON object of shape {"proposals":[{"label":string,"rationale":string,"payload":string}]} where each `payload` is the FULL improved surface text. No prose outside the JSON.';
|
|
152
|
+
function gepaDriver(opts) {
|
|
153
|
+
const evidenceK = opts.evidenceK ?? 3;
|
|
154
|
+
return {
|
|
155
|
+
kind: "gepa",
|
|
156
|
+
async propose(ctx) {
|
|
157
|
+
const parent = typeof ctx.currentSurface === "string" ? ctx.currentSurface : JSON.stringify(ctx.currentSurface);
|
|
158
|
+
const { top, bottom, target } = buildEvidence(ctx, evidenceK, opts.target);
|
|
159
|
+
const userPrompt = buildReflectionPrompt({
|
|
160
|
+
target,
|
|
161
|
+
parentPayload: parent,
|
|
162
|
+
topTrials: top,
|
|
163
|
+
bottomTrials: bottom,
|
|
164
|
+
childCount: ctx.populationSize,
|
|
165
|
+
mutationPrimitives: opts.mutationPrimitives
|
|
166
|
+
});
|
|
167
|
+
const result = await callLlm(
|
|
168
|
+
{
|
|
169
|
+
model: opts.model,
|
|
170
|
+
messages: [
|
|
171
|
+
{ role: "system", content: REFLECTION_SYSTEM },
|
|
172
|
+
{ role: "user", content: userPrompt }
|
|
173
|
+
],
|
|
174
|
+
jsonMode: true,
|
|
175
|
+
temperature: opts.temperature ?? 0.7,
|
|
176
|
+
maxTokens: opts.maxTokens ?? 6e3
|
|
177
|
+
},
|
|
178
|
+
opts.llm
|
|
179
|
+
);
|
|
180
|
+
const proposals = parseReflectionResponse(result.content, ctx.populationSize);
|
|
181
|
+
const out = [];
|
|
182
|
+
for (const proposal of proposals) {
|
|
183
|
+
const text = typeof proposal.payload === "string" ? proposal.payload.trim() : "";
|
|
184
|
+
if (text && text !== parent && !out.includes(text)) out.push(text);
|
|
185
|
+
}
|
|
186
|
+
return out;
|
|
187
|
+
}
|
|
188
|
+
};
|
|
189
|
+
}
|
|
190
|
+
function buildEvidence(ctx, evidenceK, baseTarget) {
|
|
191
|
+
const last = ctx.history.at(-1);
|
|
192
|
+
if (!last || last.candidates.length === 0) {
|
|
193
|
+
return { top: [], bottom: [], target: baseTarget };
|
|
194
|
+
}
|
|
195
|
+
const best = [...last.candidates].sort((a, b) => b.composite - a.composite)[0];
|
|
196
|
+
if (!best) return { top: [], bottom: [], target: baseTarget };
|
|
197
|
+
const byScore = [...best.scenarios].sort((a, b) => b.composite - a.composite);
|
|
198
|
+
const toTrace = (s) => ({
|
|
199
|
+
id: s.scenarioId,
|
|
200
|
+
score: s.composite
|
|
201
|
+
});
|
|
202
|
+
const top = byScore.slice(0, evidenceK).map(toTrace);
|
|
203
|
+
const bottom = byScore.slice(-evidenceK).reverse().map(toTrace);
|
|
204
|
+
const weakest = Object.entries(best.dimensions).sort((a, b) => a[1] - b[1]).slice(0, 3).map(([dim, value]) => `${dim} (${value.toFixed(2)})`);
|
|
205
|
+
const target = weakest.length > 0 ? `${baseTarget} \u2014 weakest dimensions: ${weakest.join(", ")}` : baseTarget;
|
|
206
|
+
return { top, bottom, target };
|
|
207
|
+
}
|
|
208
|
+
|
|
144
209
|
// src/campaign/gates/compose.ts
|
|
145
210
|
function composeGate(...gates) {
|
|
146
211
|
if (gates.length === 0) {
|
|
@@ -595,11 +660,16 @@ async function runOptimization(opts) {
|
|
|
595
660
|
}
|
|
596
661
|
const record = {
|
|
597
662
|
generationIndex: gen,
|
|
598
|
-
candidates: surfaceResults.map((s) =>
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
663
|
+
candidates: surfaceResults.map((s) => {
|
|
664
|
+
const breakdown = candidateBreakdown(s.campaign);
|
|
665
|
+
return {
|
|
666
|
+
surfaceHash: s.surfaceHash,
|
|
667
|
+
composite: s.composite,
|
|
668
|
+
ci95: [s.composite, s.composite],
|
|
669
|
+
dimensions: breakdown.dimensions,
|
|
670
|
+
scenarios: breakdown.scenarios
|
|
671
|
+
};
|
|
672
|
+
}),
|
|
603
673
|
promoted: promoted.map((p) => p.surfaceHash)
|
|
604
674
|
};
|
|
605
675
|
history.push(record);
|
|
@@ -637,6 +707,35 @@ function meanComposite2(campaign) {
|
|
|
637
707
|
}
|
|
638
708
|
return composites.length === 0 ? 0 : composites.reduce((a, b) => a + b, 0) / composites.length;
|
|
639
709
|
}
|
|
710
|
+
function candidateBreakdown(campaign) {
|
|
711
|
+
const dimSums = {};
|
|
712
|
+
const dimCounts = {};
|
|
713
|
+
const byScenario = /* @__PURE__ */ new Map();
|
|
714
|
+
for (const cell of campaign.cells) {
|
|
715
|
+
const judgeScores = Object.values(cell.judgeScores);
|
|
716
|
+
if (judgeScores.length === 0) continue;
|
|
717
|
+
const cellComposite = judgeScores.reduce((a, s) => a + s.composite, 0) / judgeScores.length;
|
|
718
|
+
const arr = byScenario.get(cell.scenarioId) ?? [];
|
|
719
|
+
arr.push(cellComposite);
|
|
720
|
+
byScenario.set(cell.scenarioId, arr);
|
|
721
|
+
for (const score of judgeScores) {
|
|
722
|
+
for (const [key, value] of Object.entries(score.dimensions)) {
|
|
723
|
+
dimSums[key] = (dimSums[key] ?? 0) + value;
|
|
724
|
+
dimCounts[key] = (dimCounts[key] ?? 0) + 1;
|
|
725
|
+
}
|
|
726
|
+
}
|
|
727
|
+
}
|
|
728
|
+
const dimensions = {};
|
|
729
|
+
for (const key of Object.keys(dimSums)) {
|
|
730
|
+
const count = dimCounts[key] ?? 0;
|
|
731
|
+
dimensions[key] = count > 0 ? (dimSums[key] ?? 0) / count : 0;
|
|
732
|
+
}
|
|
733
|
+
const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => ({
|
|
734
|
+
scenarioId,
|
|
735
|
+
composite: comps.reduce((a, b) => a + b, 0) / comps.length
|
|
736
|
+
}));
|
|
737
|
+
return { dimensions, scenarios };
|
|
738
|
+
}
|
|
640
739
|
|
|
641
740
|
// src/campaign/presets/run-improvement-loop.ts
|
|
642
741
|
async function runImprovementLoop(opts) {
|
|
@@ -794,6 +893,7 @@ export {
|
|
|
794
893
|
composeGate,
|
|
795
894
|
defaultProductionGate,
|
|
796
895
|
evolutionaryDriver,
|
|
896
|
+
gepaDriver,
|
|
797
897
|
gitWorktreeAdapter,
|
|
798
898
|
heldOutGate,
|
|
799
899
|
openAutoPr,
|