@tangle-network/agent-eval 0.44.1 → 0.46.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/http.d.ts +138 -0
- package/dist/adapters/http.js +203 -0
- package/dist/adapters/http.js.map +1 -0
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/langchain.js.map +1 -1
- package/dist/campaign/index.d.ts +3 -3
- package/dist/campaign/index.js +2 -2
- package/dist/{chunk-H5BGRSN4.js → chunk-HRKOCLQA.js} +3 -3
- package/dist/{chunk-RXK7FXLV.js → chunk-J3EIOI3O.js} +7 -2
- package/dist/chunk-J3EIOI3O.js.map +1 -0
- package/dist/contract/index.d.ts +199 -2
- package/dist/contract/index.js +128 -3
- package/dist/contract/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/rl.d.ts +1 -1
- package/dist/{run-campaign-GNDO66B4.js → run-campaign-6UEVBPP3.js} +2 -2
- package/dist/{run-improvement-loop-CbilHQAb.d.ts → run-improvement-loop-Bfam3MT1.d.ts} +18 -2
- package/dist/{types-DToGONFA.d.ts → types-8u72Gc76.d.ts} +9 -1
- package/docs/adapters-observability.md +121 -0
- package/docs/design/external-agent-wedge.md +2 -2
- package/docs/distributed-driver.md +173 -0
- package/docs/phase-b-pairing-kit.md +188 -0
- package/docs/phase-b-runbook.md +176 -0
- package/docs/quickstart-external.md +43 -4
- package/package.json +6 -1
- package/dist/chunk-RXK7FXLV.js.map +0 -1
- /package/dist/{chunk-H5BGRSN4.js.map → chunk-HRKOCLQA.js.map} +0 -0
- /package/dist/{run-campaign-GNDO66B4.js.map → run-campaign-6UEVBPP3.js.map} +0 -0
package/dist/contract/index.d.ts
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
|
-
|
|
2
|
-
export { C as
|
|
1
|
+
import { S as Scenario, M as MutableSurface, D as DispatchContext, J as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-8u72Gc76.js';
|
|
2
|
+
export { C as CampaignAggregates, a as CampaignArtifactWriter, b as CampaignCellResult, c as CampaignCostMeter, d as CampaignResult, e as CampaignTraceWriter, f as CodeSurface, g as Dispatch, h as GateContext, i as GateDecision, j as GateResult, k as GenerationCandidate, l as GenerationRecord, m as JudgeDimension, n as JudgeScore, o as Mutator, O as OptimizerConfig, p as SessionScript } from '../types-8u72Gc76.js';
|
|
3
|
+
import { C as CampaignStorage, R as RunImprovementLoopResult } from '../run-improvement-loop-Bfam3MT1.js';
|
|
4
|
+
export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverOptions, H as HeldOutGateOptions, a as RunCampaignOptions, b as RunEvalOptions, c as RunImprovementLoopOptions, d as composeGate, e as defaultProductionGate, f as evolutionaryDriver, g as fsCampaignStorage, h as gepaDriver, i as heldOutGate, j as inMemoryCampaignStorage, r as runCampaign, k as runEval, l as runImprovementLoop } from '../run-improvement-loop-Bfam3MT1.js';
|
|
3
5
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, a as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, O as OutcomeStore } from '../outcome-store-BxJ3DQKJ.js';
|
|
4
6
|
import '../llm-client-BXVRUZyX.js';
|
|
5
7
|
import '../errors-mje_cKOs.js';
|
|
@@ -8,3 +10,198 @@ import '@tangle-network/agent-runtime';
|
|
|
8
10
|
import '../red-team-30II1T4o.js';
|
|
9
11
|
import '../dataset-BlwAtYYf.js';
|
|
10
12
|
import '../store-Db2Bv8Cf.js';
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* # `selfImprove()` — the LAND-tier one-shot.
|
|
16
|
+
*
|
|
17
|
+
* The cheapest possible call site to run a real closed-loop self-
|
|
18
|
+
* improvement over your agent. Wraps `runImprovementLoop` with smart
|
|
19
|
+
* defaults and a budget-shaped options API; every escape hatch the
|
|
20
|
+
* substrate exposes is reachable from here without losing the
|
|
21
|
+
* one-function feel.
|
|
22
|
+
*
|
|
23
|
+
* Defaults picked to match the LAND-tier story:
|
|
24
|
+
* - In-memory storage (no filesystem touch).
|
|
25
|
+
* - `gepaDriver` reflective mutation with copywriting-flavored primitives
|
|
26
|
+
* (override `driver` or `mutationPrimitives` for any domain).
|
|
27
|
+
* - `defaultProductionGate` with `deltaThreshold: 0.05`.
|
|
28
|
+
* - Held-out split = 25% of scenarios, deterministic by id hash.
|
|
29
|
+
* - 3 generations × population 2 (raise via `budget` for more search).
|
|
30
|
+
* - `autoOnPromote: 'none'` (we don't open PRs unless you ask).
|
|
31
|
+
*
|
|
32
|
+
 * Want one-click? Provide `agent` + `scenarios` + `judge` + `baselineSurface`. Done.
|
|
33
|
+
* Want distributed? Pass `cellPlacement` + an `httpDispatch`-backed
|
|
34
|
+
* agent. Want a code-tier surface? Pass a `MutableSurface` + your own
|
|
35
|
+
* `driver`. Same function.
|
|
36
|
+
*/
|
|
37
|
+
|
|
38
|
+
/** Budget and loop-shape knobs for `selfImprove()`. Every field is optional. */
interface SelfImproveBudget {
    /** Hard $ ceiling across all cells in baseline + every generation. Cells
     * beyond the ceiling are skipped (cost-aware, not aborted). Forwarded to
     * the improvement loop as `costCeiling`; unset means no ceiling is passed. */
    dollars?: number;
    /** How many improvement generations to explore. Default 3. Set 0 to
     * skip improvement entirely (selfImprove becomes a baseline-only run). */
    generations?: number;
    /** Candidates the driver proposes per generation. Default 2. */
    populationSize?: number;
    /** Max concurrent cells across the loop. Default 2. */
    maxConcurrency?: number;
    /** Fraction of `scenarios` held out from training, used for the gate.
     * Default 0.25. Ignored when `holdoutScenarios` is set explicitly.
     * The computed holdout size is clamped to at least 1 and at most
     * `scenarios.length - 1`, so at least two scenarios are needed for a
     * non-empty train split. */
    holdoutFraction?: number;
    /** Explicit held-out scenarios; overrides `holdoutFraction`. Any entry of
     * `scenarios` whose `id` matches a held-out scenario is removed from the
     * train split. */
    holdoutScenarios?: Scenario[];
}
|
|
55
|
+
/** LLM connection settings for the default `gepaDriver`; unused when a custom
 * `driver` is supplied. */
interface SelfImproveLlm {
    /** Endpoint base URL. Default Tangle Router
     * (`https://router.tangle.tools/v1`). */
    baseUrl?: string;
    /** Bearer token. Default `process.env.OPENAI_API_KEY`; if that is also
     * unset an empty string is passed to the driver. */
    apiKey?: string;
    /** Model id used by `gepaDriver` reflection. Default
     * `anthropic/claude-sonnet-4.6`. */
    model?: string;
}
|
|
64
|
+
/**
 * Events delivered to `SelfImproveOptions.onProgress`, discriminated by `kind`.
 *
 * NOTE(review): the bundled `selfImprove` implementation emits only
 * `baseline.started`, `baseline.completed` and `gate.decided` itself, and does
 * not pass `onProgress` on to `runImprovementLoop` — confirm whether the
 * `generation.*` events are ever delivered.
 */
type SelfImproveProgressEvent = {
    /** Emitted once, before the improvement loop starts. */
    kind: 'baseline.started';
    /** Total number of scenarios passed in (train + holdout). */
    scenarios: number;
} | {
    /** Emitted after the improvement loop has resolved. */
    kind: 'baseline.completed';
    /** Baseline composite mean on the held-out scenarios. */
    compositeMean: number;
    /** Milliseconds elapsed since `selfImprove` was called. */
    durationMs: number;
} | {
    kind: 'generation.started';
    index: number;
    populationSize: number;
} | {
    kind: 'generation.completed';
    index: number;
    bestComposite: number;
    durationMs: number;
} | {
    /** Emitted after `baseline.completed`, once the gate result is known. */
    kind: 'gate.decided';
    /** The gate's decision string. */
    decision: string;
    /** Winner composite mean minus baseline composite mean, both on holdout. */
    lift: number;
};
|
|
85
|
+
/** Options accepted by `selfImprove()`. `agent`, `scenarios`, `judge` and
 * `baselineSurface` are required; everything else has a default. */
interface SelfImproveOptions<TScenario extends Scenario, TArtifact> {
    /**
     * Your agent — a function that takes the current `MutableSurface`
     * (typically a system prompt the loop is optimizing) plus the
     * scenario + cell ctx, and returns the artifact your judge scores.
     *
     * Same shape as `RunOptimizationOptions.dispatchWithSurface`. Wrap a
     * plain `Dispatch` if you don't have a surface seam:
     *
     *   agent: (_surface, scenario, ctx) => yourPlainDispatch(scenario, ctx)
     *
     * That mode evaluates without mutating any surface — useful as a
     * baseline-only run (set `budget.generations = 0`).
     */
    agent: (surface: MutableSurface, scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>;
    /** Scenarios to evaluate against. Train/holdout split is computed from
     * these unless `budget.holdoutScenarios` is set explicitly. */
    scenarios: TScenario[];
    /** Judge that scores artifacts. Bring your own; use `langchainJudge`
     * from `/adapters/langchain` for a Runnable-shaped one. */
    judge: JudgeConfig<TArtifact, TScenario>;
    /** Starting surface — system prompt, JSON config, anything `MutableSurface`
     * accepts. The driver mutates this each generation. */
    baselineSurface: MutableSurface;
    /** Budget + loop shape. All fields optional; defaults pick the LAND-tier
     * story. */
    budget?: SelfImproveBudget;
    /** Custom driver. Default is `gepaDriver` configured from `llm` +
     * `mutationPrimitives` + `driverTarget`. */
    driver?: ImprovementDriver;
    /** Default-driver override — mutation hints handed to `gepaDriver`.
     * Used only when `driver` is unset. */
    mutationPrimitives?: string[];
    /** Default-driver override — the `target` description handed to
     * `gepaDriver`. Used only when `driver` is unset. */
    driverTarget?: string;
    /** Custom gate. Default is `defaultProductionGate` with
     * `deltaThreshold: 0.05` on the held-out split. */
    gate?: Gate<TArtifact, TScenario>;
    /** LLM config consumed by the default `gepaDriver`. Ignored if you pass
     * your own `driver`. */
    llm?: SelfImproveLlm;
    /** Storage backend. Default `inMemoryCampaignStorage()` — nothing
     * persists past the call. Pass `fsCampaignStorage()` to write to disk. */
    storage?: CampaignStorage;
    /** Run directory (logical for in-memory storage, real path for fs).
     * Default `mem://selfImprove-<timestamp>`, where the timestamp is
     * `Date.now()` taken when the call starts. */
    runDir?: string;
    /** Distributed-driver seam — same as `RunCampaignOptions.cellPlacement`.
     * Returns an opaque placement key the substrate forwards to your agent
     * as `ctx.placement`. Combined with `httpDispatch` from
     * `/adapters/http`, fans cells across regions. */
    cellPlacement?: (input: {
        scenario: TScenario;
        rep: number;
        generation?: number;
    }) => string | undefined;
    /** Streaming hook. Consumer routes events wherever (UI, dashboard, logs).
     * NOTE(review): the bundled implementation calls this for
     * `baseline.started`, `baseline.completed` and `gate.decided` only —
     * confirm whether per-generation events are delivered. */
    onProgress?: (event: SelfImproveProgressEvent) => void;
    /** Auto-promotion behavior on a ship decision. Default `'none'` — we
     * return the winner; you ship it however you ship. `'pr'` opens a
     * GitHub PR via `openAutoPr`; requires `ghOwner` + `ghRepo`. */
    autoOnPromote?: 'pr' | 'none';
    /** GitHub owner for `autoOnPromote: 'pr'`; forwarded to the loop as-is. */
    ghOwner?: string;
    /** GitHub repository for `autoOnPromote: 'pr'`; forwarded to the loop as-is. */
    ghRepo?: string;
}
|
|
149
|
+
/** Summary returned by `selfImprove()`, plus the raw loop result. */
interface SelfImproveResult<TScenario extends Scenario, TArtifact> {
    /** Composite mean on the held-out set, baseline surface (derived from
     * `raw.baselineOnHoldout`). `perScenario` is keyed by scenario id. */
    baseline: {
        compositeMean: number;
        perScenario: Record<string, number>;
    };
    /** Composite mean on the held-out set, winner run (derived from
     * `raw.winnerOnHoldout`), together with the winning surface. */
    winner: {
        compositeMean: number;
        perScenario: Record<string, number>;
        surface: MutableSurface;
    };
    /** `winner.compositeMean - baseline.compositeMean`, both measured on the
     * held-out set. Positive means the gate observed improvement. */
    lift: number;
    /** Decision reported by the gate in use (`raw.gateResult.decision`) — the
     * default `defaultProductionGate` or the custom `gate` option. */
    gateDecision: 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling';
    /** Number of generations actually explored (may be less than the
     * budget if the driver gave up early). Equals `raw.generations.length`. */
    generationsExplored: number;
    /** Wall-clock total. */
    durationMs: number;
    /** Total cost across baseline + every generation: the baseline campaign's
     * cost plus the campaign cost of every surface in every generation.
     * NOTE(review): the held-out runs are not added as separate terms —
     * confirm whether their cost is already included in those campaigns. */
    totalCostUsd: number;
    /**
     * Raw substrate result for advanced inspection — full per-generation
     * candidates, full campaign artifacts, all judge scores. Useful for
     * debugging or reporting beyond the summary.
     */
    raw: RunImprovementLoopResult<TArtifact, TScenario>;
}
|
|
180
|
+
/**
 * One-shot self-improvement loop. See module docstring for defaults +
 * extension points.
 *
 * @param opts - Agent, scenarios, judge and baseline surface, plus optional
 *   budget, driver, gate, storage and progress-hook overrides.
 * @returns Held-out baseline and winner scores, the lift between them, the
 *   gate decision, cost/duration totals and the raw loop result.
 * @throws Error when the train split or the holdout split ends up empty.
 *
 * @example Minimum (LAND tier):
 *
 *   const result = await selfImprove({
 *     agent: (surface, scenario, ctx) => myAgent(surface, scenario, ctx.signal),
 *     scenarios,
 *     judge,
 *     baselineSurface: DEFAULT_PROMPT,
 *   })
 *   console.log(`lift: ${result.lift.toFixed(3)} (${result.gateDecision})`)
 *
 * @example Distributed (workers in three regions):
 *
 *   await selfImprove({
 *     agent: httpDispatch({ resolveUrl: ({ placement }) => REGION_URLS[placement!] }),
 *     scenarios,
 *     judge,
 *     baselineSurface: DEFAULT_PROMPT,
 *     cellPlacement: ({ scenario }) => scenario.region,
 *     budget: { maxConcurrency: 12 },
 *   })
 */
declare function selfImprove<TScenario extends Scenario, TArtifact>(opts: SelfImproveOptions<TScenario, TArtifact>): Promise<SelfImproveResult<TScenario, TArtifact>>;
|
|
206
|
+
|
|
207
|
+
export { CampaignStorage, DispatchContext, Gate, ImprovementDriver, JudgeConfig, MutableSurface, RunImprovementLoopResult, Scenario, type SelfImproveBudget, type SelfImproveLlm, type SelfImproveOptions, type SelfImproveProgressEvent, type SelfImproveResult, selfImprove };
|
package/dist/contract/index.js
CHANGED
|
@@ -6,12 +6,12 @@ import {
|
|
|
6
6
|
heldOutGate,
|
|
7
7
|
runEval,
|
|
8
8
|
runImprovementLoop
|
|
9
|
-
} from "../chunk-
|
|
9
|
+
} from "../chunk-HRKOCLQA.js";
|
|
10
10
|
import {
|
|
11
11
|
fsCampaignStorage,
|
|
12
12
|
inMemoryCampaignStorage,
|
|
13
13
|
runCampaign
|
|
14
|
-
} from "../chunk-
|
|
14
|
+
} from "../chunk-J3EIOI3O.js";
|
|
15
15
|
import "../chunk-N4SBKEPJ.js";
|
|
16
16
|
import "../chunk-YV7J7X5N.js";
|
|
17
17
|
import {
|
|
@@ -24,6 +24,130 @@ import "../chunk-VXNVVBZO.js";
|
|
|
24
24
|
import "../chunk-PC4UYEBM.js";
|
|
25
25
|
import "../chunk-QYJT52YW.js";
|
|
26
26
|
import "../chunk-NSBPE2FW.js";
|
|
27
|
+
|
|
28
|
+
// src/contract/self-improve.ts
|
|
29
|
+
/**
 * Deterministically split `scenarios` into a holdout set and a train set.
 *
 * Scenarios are ordered by a 32-bit FNV-1a-style hash of `scenario.id`
 * (computed over UTF-16 code units); the first `nHoldout` go to holdout and
 * the rest to train. Hash ties are broken by comparing the ids themselves, so
 * the split does not depend on the order of the input array.
 *
 * The holdout size is `round(scenarios.length * fraction)`, clamped to at
 * most `scenarios.length - 1` and then to at least 1. With fewer than two
 * scenarios the train split is therefore empty.
 *
 * @param scenarios - Objects carrying a string `id`. The array is not mutated.
 * @param fraction - Share of scenarios to hold out.
 * @returns `{ holdout, train }`, two new arrays.
 * @throws RangeError when `fraction` is NaN, which would otherwise produce a
 *   silently empty holdout.
 */
function splitTrainHoldout(scenarios, fraction) {
  if (Number.isNaN(Number(fraction))) {
    throw new RangeError(`selfImprove: holdoutFraction must be a number, got ${String(fraction)}`);
  }
  function hash(s) {
    let h = 2166136261 >>> 0;
    for (let i = 0; i < s.length; i++) {
      h ^= s.charCodeAt(i);
      // Math.imul keeps the multiply in 32 bits; >>> 0 reinterprets as unsigned.
      h = Math.imul(h, 16777619) >>> 0;
    }
    return h;
  }
  // Hash each id once up front instead of on every comparison.
  const keyed = scenarios.map((scenario) => ({ scenario, key: hash(scenario.id) }));
  keyed.sort((a, b) => {
    if (a.key !== b.key) {
      return a.key - b.key;
    }
    // Collision: fall back to the id so the result is input-order independent.
    if (a.scenario.id < b.scenario.id) return -1;
    if (a.scenario.id > b.scenario.id) return 1;
    return 0;
  });
  const sorted = keyed.map((entry) => entry.scenario);
  const nHoldout = Math.max(1, Math.min(sorted.length - 1, Math.round(sorted.length * fraction)));
  return {
    holdout: sorted.slice(0, nHoldout),
    train: sorted.slice(nHoldout)
  };
}
|
|
45
|
+
/**
 * Collapse per-scenario aggregates into a flat score map and their mean.
 *
 * @param byScenario - Map of scenario id to an aggregate exposing `meanComposite`.
 * @returns `perScenario` (id to `meanComposite`) and `compositeMean`, the
 *   unweighted mean of those values, or 0 when there are no scenarios.
 */
function meanComposite(byScenario) {
  const perScenario = {};
  let total = 0;
  let count = 0;
  for (const scenarioId of Object.keys(byScenario)) {
    const score = byScenario[scenarioId].meanComposite;
    perScenario[scenarioId] = score;
    total += score;
    count += 1;
  }
  if (count === 0) {
    // No scenarios: report 0 rather than dividing by zero.
    return { compositeMean: 0, perScenario };
  }
  return { compositeMean: total / count, perScenario };
}
|
|
57
|
+
// Mutation hints handed to the default `gepaDriver` when the caller supplies
// neither `driver` nor `mutationPrimitives`. The wording is copywriting-flavored.
var DEFAULT_MUTATION_PRIMITIVES = [
  "Tighten the hook: lead with the specific user outcome.",
  "Replace generic adjectives with specific verbs or proof numbers.",
  "Anchor every claim in something the scenario's brief literally supports.",
  "Honor the surface-shape constraint (length, register, audience vocabulary)."
];
|
|
63
|
+
/**
 * Run a baseline evaluation plus an improvement loop over `opts.agent` and
 * summarize the outcome.
 *
 * Resolves defaults for budget, driver, gate, storage and run directory,
 * splits the scenarios into train/holdout, delegates to `runImprovementLoop`,
 * then reports held-out baseline vs. winner scores, lift, gate decision, cost
 * and duration.
 *
 * @param opts - See `SelfImproveOptions`.
 * @returns The summary object, with the untouched loop result under `raw`.
 * @throws Error when the train split or the holdout split is empty.
 */
async function selfImprove(opts) {
  const startedAt = Date.now();

  // Budget knobs, each falling back to its default.
  const budget = opts.budget ?? {};
  const maxGenerations = budget.generations ?? 3;
  const populationSize = budget.populationSize ?? 2;
  const maxConcurrency = budget.maxConcurrency ?? 2;
  const holdoutFraction = budget.holdoutFraction ?? 0.25;
  const costCeiling = budget.dollars;
  const explicitHoldout = budget.holdoutScenarios;

  // An explicit holdout wins; otherwise split deterministically by id hash.
  let train;
  let holdout;
  if (explicitHoldout) {
    train = opts.scenarios.filter(
      (candidate) => explicitHoldout.every((held) => held.id !== candidate.id)
    );
    holdout = explicitHoldout;
  } else {
    ({ train, holdout } = splitTrainHoldout(opts.scenarios, holdoutFraction));
  }
  if (train.length === 0) {
    throw new Error("selfImprove: train split is empty. Reduce holdoutFraction or pass more scenarios.");
  }
  if (holdout.length === 0) {
    throw new Error("selfImprove: holdout split is empty. Pass more scenarios.");
  }

  // The default driver is only constructed when the caller did not bring one.
  let driver = opts.driver;
  if (driver === undefined || driver === null) {
    const llmOverrides = opts.llm ?? {};
    driver = gepaDriver({
      llm: {
        baseUrl: llmOverrides.baseUrl ?? "https://router.tangle.tools/v1",
        apiKey: llmOverrides.apiKey ?? process.env.OPENAI_API_KEY ?? ""
      },
      model: llmOverrides.model ?? "anthropic/claude-sonnet-4.6",
      target: opts.driverTarget ?? "agent surface (system prompt or config) being optimized by selfImprove",
      mutationPrimitives: opts.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES
    });
  }

  // Same for the gate: the default judges lift on the held-out scenarios.
  let gate = opts.gate;
  if (gate === undefined || gate === null) {
    gate = defaultProductionGate({
      holdoutScenarios: holdout,
      deltaThreshold: 0.05
    });
  }

  const storage = opts.storage ?? inMemoryCampaignStorage();
  const runDir = opts.runDir ?? `mem://selfImprove-${startedAt}`;

  if (opts.onProgress) {
    opts.onProgress({ kind: "baseline.started", scenarios: opts.scenarios.length });
  }

  const loopOptions = {
    scenarios: train,
    baselineSurface: opts.baselineSurface,
    dispatchWithSurface: opts.agent,
    driver,
    judges: [opts.judge],
    populationSize,
    maxGenerations,
    holdoutScenarios: holdout,
    gate,
    autoOnPromote: opts.autoOnPromote ?? "none",
    ghOwner: opts.ghOwner,
    ghRepo: opts.ghRepo,
    storage,
    runDir,
    maxConcurrency,
    cellPlacement: opts.cellPlacement,
    costCeiling
  };
  const result = await runImprovementLoop(loopOptions);

  // Both summaries are measured on the held-out scenarios.
  const baseline = meanComposite(result.baselineOnHoldout.aggregates.byScenario);
  const winnerStats = meanComposite(result.winnerOnHoldout.aggregates.byScenario);
  const lift = winnerStats.compositeMean - baseline.compositeMean;
  const gateDecision = result.gateResult.decision;

  if (opts.onProgress) {
    opts.onProgress({
      kind: "baseline.completed",
      compositeMean: baseline.compositeMean,
      durationMs: Date.now() - startedAt
    });
    opts.onProgress({ kind: "gate.decided", decision: gateDecision, lift });
  }

  // Cost = baseline campaign + every surface campaign of every generation.
  let generationsCost = 0;
  for (const generation of result.generations) {
    let generationCost = 0;
    for (const candidate of generation.surfaces) {
      generationCost += candidate.campaign.aggregates.totalCostUsd;
    }
    generationsCost += generationCost;
  }
  const totalCostUsd = result.baselineCampaign.aggregates.totalCostUsd + generationsCost;

  return {
    baseline,
    winner: {
      ...winnerStats,
      surface: result.winnerSurface
    },
    lift,
    gateDecision,
    generationsExplored: result.generations.length,
    durationMs: Date.now() - startedAt,
    totalCostUsd,
    raw: result
  };
}
|
|
27
151
|
export {
|
|
28
152
|
FileSystemOutcomeStore,
|
|
29
153
|
InMemoryOutcomeStore,
|
|
@@ -36,6 +160,7 @@ export {
|
|
|
36
160
|
inMemoryCampaignStorage,
|
|
37
161
|
runCampaign,
|
|
38
162
|
runEval,
|
|
39
|
-
runImprovementLoop
|
|
163
|
+
runImprovementLoop,
|
|
164
|
+
selfImprove
|
|
40
165
|
};
|
|
41
166
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
|
|
1
|
+
{"version":3,"sources":["../../src/contract/self-improve.ts"],"sourcesContent":["/**\n * # `selfImprove()` — the LAND-tier one-shot.\n *\n * The cheapest possible call site to run a real closed-loop self-\n * improvement over your agent. Wraps `runImprovementLoop` with smart\n * defaults and a budget-shaped options API; every escape hatch the\n * substrate exposes is reachable from here without losing the\n * one-function feel.\n *\n * Defaults picked to match the LAND-tier story:\n * - In-memory storage (no filesystem touch).\n * - `gepaDriver` reflective mutation with copywriting-flavored primitives\n * (override `driver` or `mutationPrimitives` for any domain).\n * - `defaultProductionGate` with `deltaThreshold: 0.05`.\n * - Held-out split = 25% of scenarios, deterministic by id hash.\n * - 3 generations × population 2 (raise via `budget` for more search).\n * - `autoOnPromote: 'none'` (we don't open PRs unless you ask).\n *\n * Want one-click? Provide `agent` + `scenarios` + `judge`. Done.\n * Want distributed? Pass `cellPlacement` + an `httpDispatch`-backed\n * agent. Want a code-tier surface? Pass a `MutableSurface` + your own\n * `driver`. Same function.\n */\n\nimport { runImprovementLoop, type RunImprovementLoopResult } from '../campaign/presets/run-improvement-loop'\nimport { gepaDriver } from '../campaign/drivers/gepa'\nimport { defaultProductionGate } from '../campaign/gates/default-production-gate'\nimport { type CampaignStorage, inMemoryCampaignStorage } from '../campaign/storage'\nimport type {\n DispatchContext,\n Gate,\n ImprovementDriver,\n JudgeConfig,\n MutableSurface,\n Scenario,\n} from '../campaign/types'\n\nexport interface SelfImproveBudget {\n /** Hard $ ceiling across all cells in baseline + every generation. Cells\n * beyond the ceiling are skipped (cost-aware, not aborted). */\n dollars?: number\n /** How many improvement generations to explore. Default 3. Set 0 to\n * skip improvement entirely (selfImprove becomes a baseline-only run). 
*/\n generations?: number\n /** Candidates the driver proposes per generation. Default 2. */\n populationSize?: number\n /** Max concurrent cells across the loop. Default 2. */\n maxConcurrency?: number\n /** Fraction of `scenarios` held out from training, used for the gate.\n * Default 0.25. Ignored when `holdoutScenarios` is set explicitly. */\n holdoutFraction?: number\n /** Explicit held-out scenarios; overrides `holdoutFraction`. */\n holdoutScenarios?: Scenario[]\n}\n\nexport interface SelfImproveLlm {\n /** Endpoint base URL. Default Tangle Router. */\n baseUrl?: string\n /** Bearer token. Default `process.env.OPENAI_API_KEY`. */\n apiKey?: string\n /** Model id used by `gepaDriver` reflection. Default\n * `anthropic/claude-sonnet-4.6`. */\n model?: string\n}\n\nexport type SelfImproveProgressEvent =\n | { kind: 'baseline.started'; scenarios: number }\n | { kind: 'baseline.completed'; compositeMean: number; durationMs: number }\n | { kind: 'generation.started'; index: number; populationSize: number }\n | { kind: 'generation.completed'; index: number; bestComposite: number; durationMs: number }\n | { kind: 'gate.decided'; decision: string; lift: number }\n\nexport interface SelfImproveOptions<TScenario extends Scenario, TArtifact> {\n /**\n * Your agent — a function that takes the current `MutableSurface`\n * (typically a system prompt the loop is optimizing) plus the\n * scenario + cell ctx, and returns the artifact your judge scores.\n *\n * Same shape as `RunOptimizationOptions.dispatchWithSurface`. Wrap a\n * plain `Dispatch` if you don't have a surface seam:\n *\n * agent: (_surface, scenario, ctx) => yourPlainDispatch(scenario, ctx)\n *\n * That mode evaluates without mutating any surface — useful as a\n * baseline-only run (set `budget.generations = 0`).\n */\n agent: (\n surface: MutableSurface,\n scenario: TScenario,\n ctx: DispatchContext,\n ) => Promise<TArtifact>\n\n /** Scenarios to evaluate against. 
Train/holdout split is computed from\n * these unless `budget.holdoutScenarios` is set explicitly. */\n scenarios: TScenario[]\n\n /** Judge that scores artifacts. Bring your own; use `langchainJudge`\n * from `/adapters/langchain` for a Runnable-shaped one. */\n judge: JudgeConfig<TArtifact, TScenario>\n\n /** Starting surface — system prompt, JSON config, anything `MutableSurface`\n * accepts. The driver mutates this each generation. */\n baselineSurface: MutableSurface\n\n /** Budget + loop shape. All fields optional; defaults pick the LAND-tier\n * story. */\n budget?: SelfImproveBudget\n\n /** Custom driver. Default is `gepaDriver` configured from `llm` +\n * `mutationPrimitives`. */\n driver?: ImprovementDriver\n\n /** Default-driver overrides — used when `driver` is unset. */\n mutationPrimitives?: string[]\n driverTarget?: string\n\n /** Custom gate. Default is `defaultProductionGate` with\n * `deltaThreshold: 0.05` on the held-out split. */\n gate?: Gate<TArtifact, TScenario>\n\n /** LLM config consumed by the default `gepaDriver`. Ignored if you pass\n * your own `driver`. */\n llm?: SelfImproveLlm\n\n /** Storage backend. Default `inMemoryCampaignStorage()` — nothing\n * persists past the call. Pass `fsCampaignStorage()` to write to disk. */\n storage?: CampaignStorage\n\n /** Run directory (logical for in-memory storage, real path for fs).\n * Default `mem://selfImprove-<timestamp>`. */\n runDir?: string\n\n /** Distributed-driver seam — same as `RunCampaignOptions.cellPlacement`.\n * Returns an opaque placement key the substrate forwards to your agent\n * as `ctx.placement`. Combined with `httpDispatch` from\n * `/adapters/http`, fans cells across regions. */\n cellPlacement?: (input: {\n scenario: TScenario\n rep: number\n generation?: number\n }) => string | undefined\n\n /** Streaming hook — fires on baseline + each generation + gate decision.\n * Consumer routes events wherever (UI, dashboard, logs). 
*/\n onProgress?: (event: SelfImproveProgressEvent) => void\n\n /** Auto-promotion behavior on a ship decision. Default `'none'` — we\n * return the winner; you ship it however you ship. `'pr'` opens a\n * GitHub PR via `openAutoPr`; requires `ghOwner` + `ghRepo`. */\n autoOnPromote?: 'pr' | 'none'\n ghOwner?: string\n ghRepo?: string\n}\n\nexport interface SelfImproveResult<TScenario extends Scenario, TArtifact> {\n /** Composite mean across all scenarios, baseline run. */\n baseline: {\n compositeMean: number\n perScenario: Record<string, number>\n }\n /** Composite mean on the held-out set, winner run. */\n winner: {\n compositeMean: number\n perScenario: Record<string, number>\n surface: MutableSurface\n }\n /** `winner.compositeMean - baselineOnHoldout.compositeMean`. Positive\n * means the gate observed improvement. */\n lift: number\n /** `defaultProductionGate.decide()` result. */\n gateDecision: 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling'\n /** Number of generations actually explored (may be less than the\n * budget if the driver gave up early). */\n generationsExplored: number\n /** Wall-clock total. */\n durationMs: number\n /** Total cost across baseline + every generation. */\n totalCostUsd: number\n /**\n * Raw substrate result for advanced inspection — full per-generation\n * candidates, full campaign artifacts, all judge scores. 
Useful for\n * debugging or reporting beyond the summary.\n */\n raw: RunImprovementLoopResult<TArtifact, TScenario>\n}\n\n/**\n * Deterministic train/holdout split by a stable hash of `scenario.id`,\n * so the same scenario set always splits the same way across runs.\n */\nfunction splitTrainHoldout<TScenario extends Scenario>(\n scenarios: TScenario[],\n fraction: number,\n): { train: TScenario[]; holdout: TScenario[] } {\n // Stable fnv-1a-ish hash of the id for ordering.\n function hash(s: string): number {\n let h = 2166136261 >>> 0\n for (let i = 0; i < s.length; i++) {\n h ^= s.charCodeAt(i)\n h = Math.imul(h, 16777619) >>> 0\n }\n return h\n }\n const sorted = [...scenarios].sort((a, b) => hash(a.id) - hash(b.id))\n const nHoldout = Math.max(1, Math.min(sorted.length - 1, Math.round(sorted.length * fraction)))\n return {\n holdout: sorted.slice(0, nHoldout),\n train: sorted.slice(nHoldout),\n }\n}\n\nfunction meanComposite(\n byScenario: Record<string, { meanComposite: number }>,\n): { compositeMean: number; perScenario: Record<string, number> } {\n const perScenario: Record<string, number> = {}\n const values: number[] = []\n for (const [id, agg] of Object.entries(byScenario)) {\n perScenario[id] = agg.meanComposite\n values.push(agg.meanComposite)\n }\n return {\n compositeMean: values.length === 0 ? 0 : values.reduce((s, v) => s + v, 0) / values.length,\n perScenario,\n }\n}\n\nconst DEFAULT_MUTATION_PRIMITIVES = [\n 'Tighten the hook: lead with the specific user outcome.',\n 'Replace generic adjectives with specific verbs or proof numbers.',\n 'Anchor every claim in something the scenario\\'s brief literally supports.',\n 'Honor the surface-shape constraint (length, register, audience vocabulary).',\n]\n\n/**\n * One-shot self-improvement loop. 
See module docstring for defaults +\n * extension points.\n *\n * @example Minimum (LAND tier):\n *\n * const result = await selfImprove({\n * agent: (surface, scenario, ctx) => myAgent(surface, scenario, ctx.signal),\n * scenarios,\n * judge,\n * baselineSurface: DEFAULT_PROMPT,\n * })\n * console.log(`lift: ${result.lift.toFixed(3)} (${result.gateDecision})`)\n *\n * @example Distributed (workers in three regions):\n *\n * await selfImprove({\n * agent: httpDispatch({ resolveUrl: ({ placement }) => REGION_URLS[placement!] }),\n * scenarios,\n * judge,\n * baselineSurface: DEFAULT_PROMPT,\n * cellPlacement: ({ scenario }) => scenario.region,\n * budget: { maxConcurrency: 12 },\n * })\n */\nexport async function selfImprove<TScenario extends Scenario, TArtifact>(\n opts: SelfImproveOptions<TScenario, TArtifact>,\n): Promise<SelfImproveResult<TScenario, TArtifact>> {\n const startedAt = Date.now()\n\n const budget = opts.budget ?? {}\n const generations = budget.generations ?? 3\n const populationSize = budget.populationSize ?? 2\n const maxConcurrency = budget.maxConcurrency ?? 2\n const holdoutFraction = budget.holdoutFraction ?? 0.25\n const costCeiling = budget.dollars\n\n const explicitHoldout = budget.holdoutScenarios\n const { train, holdout } = explicitHoldout\n ? {\n train: opts.scenarios.filter((s) => !explicitHoldout.some((h) => h.id === s.id)),\n holdout: explicitHoldout as TScenario[],\n }\n : splitTrainHoldout(opts.scenarios, holdoutFraction)\n\n if (train.length === 0) {\n throw new Error('selfImprove: train split is empty. Reduce holdoutFraction or pass more scenarios.')\n }\n if (holdout.length === 0) {\n throw new Error('selfImprove: holdout split is empty. Pass more scenarios.')\n }\n\n const driver: ImprovementDriver =\n opts.driver ??\n gepaDriver({\n llm: {\n baseUrl: opts.llm?.baseUrl ?? 'https://router.tangle.tools/v1',\n apiKey: opts.llm?.apiKey ?? process.env.OPENAI_API_KEY ?? '',\n },\n model: opts.llm?.model ?? 
'anthropic/claude-sonnet-4.6',\n target: opts.driverTarget ?? 'agent surface (system prompt or config) being optimized by selfImprove',\n mutationPrimitives: opts.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES,\n })\n\n const gate: Gate<TArtifact, TScenario> =\n opts.gate ??\n defaultProductionGate<TArtifact, TScenario>({\n holdoutScenarios: holdout,\n deltaThreshold: 0.05,\n })\n\n const storage = opts.storage ?? inMemoryCampaignStorage()\n const runDir = opts.runDir ?? `mem://selfImprove-${startedAt}`\n\n if (opts.onProgress) {\n opts.onProgress({ kind: 'baseline.started', scenarios: opts.scenarios.length })\n }\n\n const result = await runImprovementLoop<TScenario, TArtifact>({\n scenarios: train,\n baselineSurface: opts.baselineSurface,\n dispatchWithSurface: opts.agent,\n driver,\n judges: [opts.judge],\n populationSize,\n maxGenerations: generations,\n holdoutScenarios: holdout,\n gate,\n autoOnPromote: opts.autoOnPromote ?? 'none',\n ghOwner: opts.ghOwner,\n ghRepo: opts.ghRepo,\n storage,\n runDir,\n maxConcurrency,\n cellPlacement: opts.cellPlacement,\n costCeiling,\n })\n\n const baseline = meanComposite(result.baselineOnHoldout.aggregates.byScenario)\n const winnerStats = meanComposite(result.winnerOnHoldout.aggregates.byScenario)\n\n if (opts.onProgress) {\n opts.onProgress({\n kind: 'baseline.completed',\n compositeMean: baseline.compositeMean,\n durationMs: Date.now() - startedAt,\n })\n opts.onProgress({\n kind: 'gate.decided',\n decision: result.gateResult.decision,\n lift: winnerStats.compositeMean - baseline.compositeMean,\n })\n }\n\n const totalCost =\n result.baselineCampaign.aggregates.totalCostUsd +\n result.generations.reduce(\n (sum, gen) => sum + gen.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0),\n 0,\n )\n\n return {\n baseline,\n winner: {\n ...winnerStats,\n surface: result.winnerSurface,\n },\n lift: winnerStats.compositeMean - baseline.compositeMean,\n gateDecision: result.gateResult.decision,\n 
generationsExplored: result.generations.length,\n durationMs: Date.now() - startedAt,\n totalCostUsd: totalCost,\n raw: result,\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;AA8LA,SAAS,kBACP,WACA,UAC8C;AAE9C,WAAS,KAAK,GAAmB;AAC/B,QAAI,IAAI,eAAe;AACvB,aAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,WAAK,EAAE,WAAW,CAAC;AACnB,UAAI,KAAK,KAAK,GAAG,QAAQ,MAAM;AAAA,IACjC;AACA,WAAO;AAAA,EACT;AACA,QAAM,SAAS,CAAC,GAAG,SAAS,EAAE,KAAK,CAAC,GAAG,MAAM,KAAK,EAAE,EAAE,IAAI,KAAK,EAAE,EAAE,CAAC;AACpE,QAAM,WAAW,KAAK,IAAI,GAAG,KAAK,IAAI,OAAO,SAAS,GAAG,KAAK,MAAM,OAAO,SAAS,QAAQ,CAAC,CAAC;AAC9F,SAAO;AAAA,IACL,SAAS,OAAO,MAAM,GAAG,QAAQ;AAAA,IACjC,OAAO,OAAO,MAAM,QAAQ;AAAA,EAC9B;AACF;AAEA,SAAS,cACP,YACgE;AAChE,QAAM,cAAsC,CAAC;AAC7C,QAAM,SAAmB,CAAC;AAC1B,aAAW,CAAC,IAAI,GAAG,KAAK,OAAO,QAAQ,UAAU,GAAG;AAClD,gBAAY,EAAE,IAAI,IAAI;AACtB,WAAO,KAAK,IAAI,aAAa;AAAA,EAC/B;AACA,SAAO;AAAA,IACL,eAAe,OAAO,WAAW,IAAI,IAAI,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,OAAO;AAAA,IACpF;AAAA,EACF;AACF;AAEA,IAAM,8BAA8B;AAAA,EAClC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AA2BA,eAAsB,YACpB,MACkD;AAClD,QAAM,YAAY,KAAK,IAAI;AAE3B,QAAM,SAAS,KAAK,UAAU,CAAC;AAC/B,QAAM,cAAc,OAAO,eAAe;AAC1C,QAAM,iBAAiB,OAAO,kBAAkB;AAChD,QAAM,iBAAiB,OAAO,kBAAkB;AAChD,QAAM,kBAAkB,OAAO,mBAAmB;AAClD,QAAM,cAAc,OAAO;AAE3B,QAAM,kBAAkB,OAAO;AAC/B,QAAM,EAAE,OAAO,QAAQ,IAAI,kBACvB;AAAA,IACE,OAAO,KAAK,UAAU,OAAO,CAAC,MAAM,CAAC,gBAAgB,KAAK,CAAC,MAAM,EAAE,OAAO,EAAE,EAAE,CAAC;AAAA,IAC/E,SAAS;AAAA,EACX,IACA,kBAAkB,KAAK,WAAW,eAAe;AAErD,MAAI,MAAM,WAAW,GAAG;AACtB,UAAM,IAAI,MAAM,mFAAmF;AAAA,EACrG;AACA,MAAI,QAAQ,WAAW,GAAG;AACxB,UAAM,IAAI,MAAM,2DAA2D;AAAA,EAC7E;AAEA,QAAM,SACJ,KAAK,UACL,WAAW;AAAA,IACT,KAAK;AAAA,MACH,SAAS,KAAK,KAAK,WAAW;AAAA,MAC9B,QAAQ,KAAK,KAAK,UAAU,QAAQ,IAAI,kBAAkB;AAAA,IAC5D;AAAA,IACA,OAAO,KAAK,KAAK,SAAS;AAAA,IAC1B,QAAQ,KAAK,gBAAgB;AAAA,IAC7B,oBAAoB,KAAK,sBAAsB;AAAA,EACjD,CAAC;AAEH,QAAM,OACJ,KAAK,QACL,sBAA4C;AAAA,IAC1C,kBAAkB;AAAA,IAClB,gBAAgB;AAAA,EAClB,CAAC;AAEH,QAAM,UAAU,KAAK,WAAW,wBAAwB;AACxD,QAAM,SAAS,KAAK,UAAU,qBAAqB,SAAS;AAE5D,MAAI,KAAK,YAAY;AACnB,S
AAK,WAAW,EAAE,MAAM,oBAAoB,WAAW,KAAK,UAAU,OAAO,CAAC;AAAA,EAChF;AAEA,QAAM,SAAS,MAAM,mBAAyC;AAAA,IAC5D,WAAW;AAAA,IACX,iBAAiB,KAAK;AAAA,IACtB,qBAAqB,KAAK;AAAA,IAC1B;AAAA,IACA,QAAQ,CAAC,KAAK,KAAK;AAAA,IACnB;AAAA,IACA,gBAAgB;AAAA,IAChB,kBAAkB;AAAA,IAClB;AAAA,IACA,eAAe,KAAK,iBAAiB;AAAA,IACrC,SAAS,KAAK;AAAA,IACd,QAAQ,KAAK;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA,eAAe,KAAK;AAAA,IACpB;AAAA,EACF,CAAC;AAED,QAAM,WAAW,cAAc,OAAO,kBAAkB,WAAW,UAAU;AAC7E,QAAM,cAAc,cAAc,OAAO,gBAAgB,WAAW,UAAU;AAE9E,MAAI,KAAK,YAAY;AACnB,SAAK,WAAW;AAAA,MACd,MAAM;AAAA,MACN,eAAe,SAAS;AAAA,MACxB,YAAY,KAAK,IAAI,IAAI;AAAA,IAC3B,CAAC;AACD,SAAK,WAAW;AAAA,MACd,MAAM;AAAA,MACN,UAAU,OAAO,WAAW;AAAA,MAC5B,MAAM,YAAY,gBAAgB,SAAS;AAAA,IAC7C,CAAC;AAAA,EACH;AAEA,QAAM,YACJ,OAAO,iBAAiB,WAAW,eACnC,OAAO,YAAY;AAAA,IACjB,CAAC,KAAK,QAAQ,MAAM,IAAI,SAAS,OAAO,CAAC,GAAG,OAAO,IAAI,GAAG,SAAS,WAAW,cAAc,CAAC;AAAA,IAC7F;AAAA,EACF;AAEF,SAAO;AAAA,IACL;AAAA,IACA,QAAQ;AAAA,MACN,GAAG;AAAA,MACH,SAAS,OAAO;AAAA,IAClB;AAAA,IACA,MAAM,YAAY,gBAAgB,SAAS;AAAA,IAC3C,cAAc,OAAO,WAAW;AAAA,IAChC,qBAAqB,OAAO,YAAY;AAAA,IACxC,YAAY,KAAK,IAAI,IAAI;AAAA,IACzB,cAAc;AAAA,IACd,KAAK;AAAA,EACP;AACF;","names":[]}
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.
|
|
5
|
+
"version": "0.45.0",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
package/dist/rl.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
|
|
2
|
-
import { d as CampaignResult } from './types-
|
|
2
|
+
import { d as CampaignResult } from './types-8u72Gc76.js';
|
|
3
3
|
import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-CoJMs2Iz.js';
|
|
4
4
|
export { r as runEvalCampaign } from './researcher-CoJMs2Iz.js';
|
|
5
5
|
import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import {
|
|
2
2
|
runCampaign
|
|
3
|
-
} from "./chunk-
|
|
3
|
+
} from "./chunk-J3EIOI3O.js";
|
|
4
4
|
import "./chunk-WP7SY7AI.js";
|
|
5
5
|
import "./chunk-QYJT52YW.js";
|
|
6
6
|
import "./chunk-NSBPE2FW.js";
|
|
7
7
|
export {
|
|
8
8
|
runCampaign
|
|
9
9
|
};
|
|
10
|
-
//# sourceMappingURL=run-campaign-
|
|
10
|
+
//# sourceMappingURL=run-campaign-6UEVBPP3.js.map
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { S as Scenario, d as CampaignResult, j as GateResult, o as Mutator, I as ImprovementDriver, G as Gate,
|
|
1
|
+
import { S as Scenario, d as CampaignResult, j as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, g as DispatchFn, J as JudgeConfig, L as LabeledScenarioStore, e as CampaignTraceWriter, M as MutableSurface, l as GenerationRecord } from './types-8u72Gc76.js';
|
|
2
2
|
import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
|
|
3
3
|
import { RunRecord } from '@tangle-network/agent-runtime';
|
|
4
4
|
import { R as RedTeamCase } from './red-team-30II1T4o.js';
|
|
@@ -267,6 +267,22 @@ interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {
|
|
|
267
267
|
* (Cloudflare Workers, Deno, edge) — the `CampaignResult` is still
|
|
268
268
|
* produced; artifacts/traces just aren't persisted to disk. */
|
|
269
269
|
storage?: CampaignStorage;
|
|
270
|
+
/**
|
|
271
|
+
* Optional per-cell placement strategy. Returns an opaque string the
|
|
272
|
+
* substrate forwards as `ctx.placement` to the Dispatch — placement-aware
|
|
273
|
+
* Dispatches (e.g. `httpDispatch` from `/adapters/http`) use it to route
|
|
274
|
+
* each cell to the right worker, region, or sandbox. When unset, every
|
|
275
|
+
* cell receives `ctx.placement = undefined` and behaves identically to
|
|
276
|
+
* the in-process case.
|
|
277
|
+
*
|
|
278
|
+
* @example
|
|
279
|
+
* cellPlacement: ({ scenario }) => scenario.tags?.includes('eu') ? 'eu-west' : 'us-east'
|
|
280
|
+
*/
|
|
281
|
+
cellPlacement?: (input: {
|
|
282
|
+
scenario: TScenario;
|
|
283
|
+
rep: number;
|
|
284
|
+
generation?: number;
|
|
285
|
+
}) => string | undefined;
|
|
270
286
|
}
|
|
271
287
|
declare function runCampaign<TScenario extends Scenario, TArtifact>(opts: RunCampaignOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
|
|
272
288
|
|
|
@@ -398,4 +414,4 @@ interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extend
|
|
|
398
414
|
}
|
|
399
415
|
declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>;
|
|
400
416
|
|
|
401
|
-
export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverOptions as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type
|
|
417
|
+
export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverOptions as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunImprovementLoopResult as R, type RunCampaignOptions as a, type RunEvalOptions as b, type RunImprovementLoopOptions as c, composeGate as d, defaultProductionGate as e, evolutionaryDriver as f, fsCampaignStorage as g, gepaDriver as h, heldOutGate as i, inMemoryCampaignStorage as j, runEval as k, runImprovementLoop as l, type OpenAutoPrResult as m, type RunOptimizationOptions as n, type RunOptimizationResult as o, openAutoPr as p, runOptimization as q, runCampaign as r, surfaceHash as s };
|
|
@@ -40,6 +40,14 @@ interface DispatchContext {
|
|
|
40
40
|
cycleId?: string;
|
|
41
41
|
/** Populated when the substrate resumed from a prior cache hit. */
|
|
42
42
|
resumedFrom?: string;
|
|
43
|
+
/**
|
|
44
|
+
* Opaque placement key supplied by `RunCampaignOptions.cellPlacement`.
|
|
45
|
+
* The substrate forwards it through unchanged; placement-aware Dispatch
|
|
46
|
+
* implementations (e.g. `httpDispatch` from `/adapters/http`) read it to
|
|
47
|
+
* route the cell to the right worker / region / sandbox. `undefined`
|
|
48
|
+
* when no placement strategy is configured.
|
|
49
|
+
*/
|
|
50
|
+
placement?: string;
|
|
43
51
|
}
|
|
44
52
|
/** @experimental One function: scenario + ctx → artifact. Dispatcher chooses
|
|
45
53
|
* whether to call `runMultishot`, `runLoop`, raw `streamPrompt`, anything. */
|
|
@@ -364,4 +372,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
|
|
|
364
372
|
scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
|
|
365
373
|
}
|
|
366
374
|
|
|
367
|
-
export type { CampaignAggregates as C,
|
|
375
|
+
export type { CampaignAggregates as C, DispatchContext as D, Gate as G, ImprovementDriver as I, JudgeConfig as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, CampaignArtifactWriter as a, CampaignCellResult as b, CampaignCostMeter as c, CampaignResult as d, CampaignTraceWriter as e, CodeSurface as f, DispatchFn as g, GateContext as h, GateDecision as i, GateResult as j, GenerationCandidate as k, GenerationRecord as l, JudgeDimension as m, JudgeScore as n, Mutator as o, SessionScript as p, LabeledScenarioWrite as q, LabeledScenarioSampleArgs as r, LabeledScenarioRecord as s, JudgeAggregate as t, LabeledScenarioSource as u, ScenarioAggregate as v };
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# Composing agent-eval with your observability stack
|
|
2
|
+
|
|
3
|
+
`@tangle-network/agent-eval` ships its own OpenTelemetry pipeline
|
|
4
|
+
(`@tangle-network/agent-eval/telemetry`) that emits spans for every
|
|
5
|
+
cell, judge invocation, mutator proposal, and gate decision. **It's
|
|
6
|
+
just OTel** — same protocol as Langfuse SDK, OpenLLMetry, Arize
|
|
7
|
+
Phoenix, TraceAI, and the OpenTelemetry GenAI semantic conventions.
|
|
8
|
+
|
|
9
|
+
That means: if you already instrument your agent with any OTel-native
|
|
10
|
+
observability tool, the two compose **for free at the protocol layer**.
|
|
11
|
+
This doc shows the composition pattern; no agent-eval-specific adapter
|
|
12
|
+
code required.
|
|
13
|
+
|
|
14
|
+
## TL;DR — one OTel context, two emitters
|
|
15
|
+
|
|
16
|
+
1. Set up a shared OTel tracer provider in your process (or service mesh).
|
|
17
|
+
2. Configure your observability tool (TraceAI / Langfuse / OpenLLMetry /
|
|
18
|
+
Phoenix) to register its instrumentations against that provider.
|
|
19
|
+
3. Configure agent-eval's `/telemetry` exporter against the same provider.
|
|
20
|
+
4. Run a campaign. Both sets of spans land at your OTel collector.
|
|
21
|
+
5. Filter / route / fan-out at the collector layer — Jaeger, Tempo,
|
|
22
|
+
Phoenix, Langfuse cloud, your private collector, whatever.
|
|
23
|
+
|
|
24
|
+
The Tangle substrate doesn't compete with the observability tool;
|
|
25
|
+
they're orthogonal. The tool tells you *what your agent did*; the
|
|
26
|
+
substrate tells you *what the campaign / judge / mutator decided about
|
|
27
|
+
it*. Unified at the trace level, you see both as one timeline per cell.
|
|
28
|
+
|
|
29
|
+
## Per-tool notes
|
|
30
|
+
|
|
31
|
+
### TraceAI (Future-AGI)
|
|
32
|
+
|
|
33
|
+
- TS SDK auto-instruments OpenAI/Anthropic SDKs + LangChain.
|
|
34
|
+
- Compatible with the OpenTelemetry GenAI semantic conventions.
|
|
35
|
+
- Compose: register TraceAI's instrumentations on the global tracer
|
|
36
|
+
provider, then either point both at your OTLP collector or at
|
|
37
|
+
TraceAI's hosted backend if you want their UI.
|
|
38
|
+
|
|
39
|
+
### Langfuse SDK
|
|
40
|
+
|
|
41
|
+
- Larger installed base; has its own hosted product + OSS self-host.
|
|
42
|
+
- Their OpenTelemetry-compatible mode ships LLM call spans with
|
|
43
|
+
Langfuse-specific attributes preserved.
|
|
44
|
+
- Compose: register Langfuse as an OTel processor; agent-eval's
|
|
45
|
+
campaign/judge/mutator spans appear alongside the LLM calls in their
|
|
46
|
+
UI.
|
|
47
|
+
|
|
48
|
+
### OpenLLMetry (Traceloop)
|
|
49
|
+
|
|
50
|
+
- OSS auto-instrumentation library; OTel-native by design.
|
|
51
|
+
- Wide framework coverage (LangChain, LlamaIndex, Haystack, OpenAI,
|
|
52
|
+
Anthropic).
|
|
53
|
+
- Compose: set up Traceloop's exporter; agent-eval's exporter shares
|
|
54
|
+
the same trace context per cell.
|
|
55
|
+
|
|
56
|
+
### Arize Phoenix
|
|
57
|
+
|
|
58
|
+
- OSS observability backend; strong in the eval-tooling community.
|
|
59
|
+
- OTel-native ingest; renders trace + span attributes per the GenAI
|
|
60
|
+
semantic conventions.
|
|
61
|
+
- Compose: point both exporters at your local Phoenix instance. Phoenix
|
|
62
|
+
becomes the unified UI for both LLM-call traces and campaign spans.
|
|
63
|
+
|
|
64
|
+
## Wiring pattern (reference)
|
|
65
|
+
|
|
66
|
+
```ts
|
|
67
|
+
import { trace } from '@opentelemetry/api'
|
|
68
|
+
import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node'
|
|
69
|
+
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'
|
|
70
|
+
import { SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base'
|
|
71
|
+
|
|
72
|
+
// 1. One shared tracer provider for the process.
|
|
73
|
+
const provider = new NodeTracerProvider()
|
|
74
|
+
provider.addSpanProcessor(new SimpleSpanProcessor(
|
|
75
|
+
new OTLPTraceExporter({ url: 'http://localhost:4318/v1/traces' }),
|
|
76
|
+
))
|
|
77
|
+
provider.register()
|
|
78
|
+
|
|
79
|
+
// 2. Your observability tool registers against the global provider.
|
|
80
|
+
// Example for TraceAI / OpenLLMetry / Langfuse — call their init.
|
|
81
|
+
// (See each tool's docs.)
|
|
82
|
+
|
|
83
|
+
// 3. agent-eval is already OTel-native; it picks up the same global
|
|
84
|
+
// provider. Just ensure `@tangle-network/agent-eval/telemetry` is
|
|
85
|
+
// initialized for the campaign:
|
|
86
|
+
import { setOtelExporter } from '@tangle-network/agent-eval/telemetry'
|
|
87
|
+
setOtelExporter({ kind: 'otel-global' }) // use the global provider
|
|
88
|
+
|
|
89
|
+
// 4. Run your campaign — both sets of spans land at the collector.
|
|
90
|
+
import { runEval } from '@tangle-network/agent-eval/contract'
|
|
91
|
+
await runEval({ /* ... */ })
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
That's it. No new adapter needs to ship — the libraries are already
|
|
95
|
+
designed to live in the same OTel ecosystem.
|
|
96
|
+
|
|
97
|
+
## When you'd want a deeper, code-level adapter
|
|
98
|
+
|
|
99
|
+
There are two cases where a thin adapter would add value beyond the
|
|
100
|
+
OTel-protocol composition:
|
|
101
|
+
|
|
102
|
+
1. **Cost-aware judging.** Your observability tool's auto-instrumented
|
|
103
|
+
spans carry token counts + cost. A custom `JudgeConfig` can read
|
|
104
|
+
them via the OTel context and refuse to score artifacts that
|
|
105
|
+
exceeded a per-call budget. Easy to write yourself; we'll ship a
|
|
106
|
+
reference helper (`costAwareJudgeFromOtel`) when a partner pulls on
|
|
107
|
+
this.
|
|
108
|
+
2. **Tool-aware judging.** Your instrumentation captures the tool-call
|
|
109
|
+
sequence (`langchain.tool.invoked`, `openai.function.called`, etc.).
|
|
110
|
+
A judge that scores "did the agent use the right tool" reads those
|
|
111
|
+
spans directly. Also straightforward; helper ships when needed.
|
|
112
|
+
|
|
113
|
+
Both of these are L1-tier ergonomic helpers; the underlying composition
|
|
114
|
+
works today without them.
|
|
115
|
+
|
|
116
|
+
## What this does NOT install
|
|
117
|
+
|
|
118
|
+
No new dependencies. No new peer deps. No `@traceai/*`, no
|
|
119
|
+
`@langfuse/*`, no `@opentelemetry/*` in our manifest. You bring the
|
|
120
|
+
observability stack you want; agent-eval just emits OTel and respects
|
|
121
|
+
whatever provider is registered.
|
|
@@ -32,11 +32,11 @@ So adoption is *graduated*, and the builder picks the depth: (1) **trace-analysi
|
|
|
32
32
|
|
|
33
33
|
| Tier | What they do | What they get | Billing |
|
|
34
34
|
|---|---|---|---|
|
|
35
|
-
| **LAND** (exists today) | `npm i @tangle-network/agent-eval`, wrap their agent behind one `dispatch` seam, bring a judge | Full self-improvement loop + **local** trace/eval artifacts. Any infra, no sandbox. | Free (lib) |
|
|
35
|
+
| **LAND** (exists today) | `npm i @tangle-network/agent-eval`, wrap their agent behind one `dispatch` seam, bring a judge | Full self-improvement loop + **local** trace/eval artifacts. Any infra, no sandbox. | Free (lib) — **with optional Tangle Router as a $0-friction inference upsell.** When a builder points `OPENAI_BASE_URL` at `router.tangle.tools/v1`, every campaign call (agent + judge + reflective mutation) routes through us; we earn the routing margin. Same code, opt-in monetization vector that ships today. |
|
|
36
36
|
| **EXPAND** (the build) | Route trace/eval/labeled-scenario data to our orchestrator | Hosted dashboards, cross-run intelligence, the capture flywheel as a service | **Metered** — composes with existing sandbox Stripe + cost-ledger |
|
|
37
37
|
| **PLATFORM** (the carrot) | Move execution into our sandbox (agent-dev-container) | Substrate + orchestrator data/intelligence pre-wired, batteries included | Sandbox usage |
|
|
38
38
|
|
|
39
|
-
The free lib casts the widest possible net at near-zero cost (it's already published).
|
|
39
|
+
The free lib casts the widest possible net at near-zero cost (it's already published). LAND is **not actually zero-revenue** — pointing the loop at Tangle Router is a one-line config change with no other code differences, so we monetize inference for any LAND-tier adopter who opts in. The wedge ladder is therefore four steps: no-revenue install → router routing margin (LAND with router) → metered data hosting (EXPAND) → sandbox usage (PLATFORM). Each step is a one-line config change, never a rewrite. Value capture concentrates at EXPAND (hosting their data/intelligence is the biggest billable surface), but LAND-with-router is the immediate upsell available from day one.
|
|
40
40
|
|
|
41
41
|
## Plan & gates — land-first, validate, then build
|
|
42
42
|
|