@tangle-network/agent-eval 0.44.1 → 0.46.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,7 @@
1
- export { C as CampaignAggregates, a as CampaignArtifactWriter, b as CampaignCellResult, c as CampaignCostMeter, d as CampaignResult, e as CampaignTraceWriter, f as CodeSurface, D as Dispatch, g as DispatchContext, G as Gate, h as GateContext, i as GateDecision, j as GateResult, k as GenerationCandidate, l as GenerationRecord, I as ImprovementDriver, J as JudgeConfig, m as JudgeDimension, n as JudgeScore, M as MutableSurface, o as Mutator, O as OptimizerConfig, S as Scenario, p as SessionScript } from '../types-DToGONFA.js';
2
- export { C as CampaignStorage, D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, a as RunEvalOptions, b as RunImprovementLoopOptions, c as RunImprovementLoopResult, d as composeGate, e as defaultProductionGate, f as evolutionaryDriver, g as fsCampaignStorage, h as gepaDriver, i as heldOutGate, j as inMemoryCampaignStorage, r as runCampaign, k as runEval, l as runImprovementLoop } from '../run-improvement-loop-CbilHQAb.js';
1
+ import { S as Scenario, M as MutableSurface, D as DispatchContext, J as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-8u72Gc76.js';
2
+ export { C as CampaignAggregates, a as CampaignArtifactWriter, b as CampaignCellResult, c as CampaignCostMeter, d as CampaignResult, e as CampaignTraceWriter, f as CodeSurface, g as Dispatch, h as GateContext, i as GateDecision, j as GateResult, k as GenerationCandidate, l as GenerationRecord, m as JudgeDimension, n as JudgeScore, o as Mutator, O as OptimizerConfig, p as SessionScript } from '../types-8u72Gc76.js';
3
+ import { C as CampaignStorage, R as RunImprovementLoopResult } from '../run-improvement-loop-Bfam3MT1.js';
4
+ export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverOptions, H as HeldOutGateOptions, a as RunCampaignOptions, b as RunEvalOptions, c as RunImprovementLoopOptions, d as composeGate, e as defaultProductionGate, f as evolutionaryDriver, g as fsCampaignStorage, h as gepaDriver, i as heldOutGate, j as inMemoryCampaignStorage, r as runCampaign, k as runEval, l as runImprovementLoop } from '../run-improvement-loop-Bfam3MT1.js';
3
5
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, a as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, O as OutcomeStore } from '../outcome-store-BxJ3DQKJ.js';
4
6
  import '../llm-client-BXVRUZyX.js';
5
7
  import '../errors-mje_cKOs.js';
@@ -8,3 +10,198 @@ import '@tangle-network/agent-runtime';
8
10
  import '../red-team-30II1T4o.js';
9
11
  import '../dataset-BlwAtYYf.js';
10
12
  import '../store-Db2Bv8Cf.js';
13
+
14
+ /**
15
+ * # `selfImprove()` — the LAND-tier one-shot.
16
+ *
17
+ * The cheapest possible call site to run a real closed-loop self-
18
+ * improvement over your agent. Wraps `runImprovementLoop` with smart
19
+ * defaults and a budget-shaped options API; every escape hatch the
20
+ * substrate exposes is reachable from here without losing the
21
+ * one-function feel.
22
+ *
23
+ * Defaults picked to match the LAND-tier story:
24
+ * - In-memory storage (no filesystem touch).
25
+ * - `gepaDriver` reflective mutation with copywriting-flavored primitives
26
+ * (override `driver` or `mutationPrimitives` for any domain).
27
+ * - `defaultProductionGate` with `deltaThreshold: 0.05`.
28
+ * - Held-out split = 25% of scenarios, deterministic by id hash.
29
+ * - 3 generations × population 2 (raise via `budget` for more search).
30
+ * - `autoOnPromote: 'none'` (we don't open PRs unless you ask).
31
+ *
32
+ * Want one-click? Provide `agent` + `scenarios` + `judge`. Done.
33
+ * Want distributed? Pass `cellPlacement` + an `httpDispatch`-backed
34
+ * agent. Want a code-tier surface? Pass a `MutableSurface` + your own
35
+ * `driver`. Same function.
36
+ */
37
+
38
+ interface SelfImproveBudget {
39
+ /** Hard $ ceiling across all cells in baseline + every generation. Cells
40
+ * beyond the ceiling are skipped (cost-aware, not aborted). Forwarded to
+ * the improvement loop as `costCeiling`; when unset, `costCeiling` is
+ * `undefined`. */
41
+ dollars?: number;
42
+ /** How many improvement generations to explore. Default 3. Set 0 to
43
+ * skip improvement entirely (selfImprove becomes a baseline-only run).
+ * Forwarded to the improvement loop as `maxGenerations`. */
44
+ generations?: number;
45
+ /** Candidates the driver proposes per generation. Default 2. */
46
+ populationSize?: number;
47
+ /** Max concurrent cells across the loop. Default 2. */
48
+ maxConcurrency?: number;
49
+ /** Fraction of `scenarios` held out from training, used for the gate.
50
+ * Default 0.25. Ignored when `holdoutScenarios` is set explicitly.
+ * The held-out count is `round(scenarios.length * holdoutFraction)`,
+ * clamped to at least 1 and at most `scenarios.length - 1`. */
51
+ holdoutFraction?: number;
52
+ /** Explicit held-out scenarios; overrides `holdoutFraction`. Entries of
+ * `scenarios` whose `id` equals the `id` of one of these are removed
+ * from the training set. */
53
+ holdoutScenarios?: Scenario[];
54
+ }
55
+ interface SelfImproveLlm {
56
+ /** Endpoint base URL. Default Tangle Router
+ * (`https://router.tangle.tools/v1`). */
57
+ baseUrl?: string;
58
+ /** Bearer token. Default `process.env.OPENAI_API_KEY`; when that is also
+ * unset, an empty string is handed to the default driver. */
59
+ apiKey?: string;
60
+ /** Model id used by `gepaDriver` reflection. Default
61
+ * `anthropic/claude-sonnet-4.6`. */
62
+ model?: string;
63
+ }
64
+ type SelfImproveProgressEvent = {
65
+ kind: 'baseline.started'; // sent once, before the improvement loop is invoked
66
+ scenarios: number; // number of input scenarios (train and holdout together)
67
+ } | {
68
+ kind: 'baseline.completed'; // sent after the whole improvement loop has returned
69
+ compositeMean: number; // baseline composite mean on the held-out scenarios
70
+ durationMs: number; // milliseconds elapsed since selfImprove() was called
71
+ } | {
72
+ kind: 'generation.started'; // NOTE(review): the bundled selfImprove() in this release does not appear to emit this kind — confirm
73
+ index: number;
74
+ populationSize: number;
75
+ } | {
76
+ kind: 'generation.completed'; // NOTE(review): the bundled selfImprove() in this release does not appear to emit this kind — confirm
77
+ index: number;
78
+ bestComposite: number;
79
+ durationMs: number;
80
+ } | {
81
+ kind: 'gate.decided'; // sent after the loop has returned, right after 'baseline.completed'
82
+ decision: string; // the `decision` field of the gate result
83
+ lift: number; // winner holdout composite mean minus baseline holdout composite mean
84
+ };
85
+ interface SelfImproveOptions<TScenario extends Scenario, TArtifact> {
86
+ /**
87
+ * Your agent — a function that takes the current `MutableSurface`
88
+ * (typically a system prompt the loop is optimizing) plus the
89
+ * scenario + cell ctx, and returns the artifact your judge scores.
90
+ *
91
+ * Same shape as `RunOptimizationOptions.dispatchWithSurface`. Wrap a
92
+ * plain `Dispatch` if you don't have a surface seam:
93
+ *
94
+ * agent: (_surface, scenario, ctx) => yourPlainDispatch(scenario, ctx)
95
+ *
96
+ * That mode evaluates without mutating any surface — useful as a
97
+ * baseline-only run (set `budget.generations = 0`).
98
+ */
99
+ agent: (surface: MutableSurface, scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>;
100
+ /** Scenarios to evaluate against. Train/holdout split is computed from
101
+ * these unless `budget.holdoutScenarios` is set explicitly. */
102
+ scenarios: TScenario[];
103
+ /** Judge that scores artifacts. Bring your own; use `langchainJudge`
104
+ * from `/adapters/langchain` for a Runnable-shaped one. Handed to the
+ * improvement loop as its only judge. */
105
+ judge: JudgeConfig<TArtifact, TScenario>;
106
+ /** Starting surface — system prompt, JSON config, anything `MutableSurface`
107
+ * accepts. The driver mutates this each generation. */
108
+ baselineSurface: MutableSurface;
109
+ /** Budget + loop shape. All fields optional; defaults pick the LAND-tier
110
+ * story. */
111
+ budget?: SelfImproveBudget;
112
+ /** Custom driver. Default is `gepaDriver` configured from `llm` +
113
+ * `mutationPrimitives`. */
114
+ driver?: ImprovementDriver;
115
+ /** Default-driver overrides — used when `driver` is unset. */
116
+ mutationPrimitives?: string[];
117
+ driverTarget?: string;
118
+ /** Custom gate. Default is `defaultProductionGate` with
119
+ * `deltaThreshold: 0.05` on the held-out split. */
120
+ gate?: Gate<TArtifact, TScenario>;
121
+ /** LLM config consumed by the default `gepaDriver`. Ignored if you pass
122
+ * your own `driver`. */
123
+ llm?: SelfImproveLlm;
124
+ /** Storage backend. Default `inMemoryCampaignStorage()` — nothing
125
+ * persists past the call. Pass `fsCampaignStorage()` to write to disk. */
126
+ storage?: CampaignStorage;
127
+ /** Run directory (logical for in-memory storage, real path for fs).
128
+ * Default `mem://selfImprove-<timestamp>`, where the timestamp is the
+ * `Date.now()` value taken when the call starts. */
129
+ runDir?: string;
130
+ /** Distributed-driver seam — same as `RunCampaignOptions.cellPlacement`.
131
+ * Returns an opaque placement key the substrate forwards to your agent
132
+ * as `ctx.placement`. Combined with `httpDispatch` from
133
+ * `/adapters/http`, fans cells across regions. */
134
+ cellPlacement?: (input: {
135
+ scenario: TScenario;
136
+ rep: number;
137
+ generation?: number;
138
+ }) => string | undefined;
139
+ /** Progress hook. The bundled implementation calls it with
140
+ * `baseline.started` before the loop starts, then with
+ * `baseline.completed` and `gate.decided` once the loop has returned.
+ * Consumer routes events wherever (UI, dashboard, logs). */
141
+ onProgress?: (event: SelfImproveProgressEvent) => void;
142
+ /** Auto-promotion behavior on a ship decision. Default `'none'` — we
143
+ * return the winner; you ship it however you ship. `'pr'` opens a
144
+ * GitHub PR via `openAutoPr`; requires `ghOwner` + `ghRepo`. */
145
+ autoOnPromote?: 'pr' | 'none';
146
+ ghOwner?: string;
147
+ ghRepo?: string;
148
+ }
149
+ interface SelfImproveResult<TScenario extends Scenario, TArtifact> {
150
+ /** Composite mean on the held-out set for the baseline surface (taken
+ * from `raw.baselineOnHoldout`, not from every input scenario). */
151
+ baseline: {
152
+ compositeMean: number;
153
+ perScenario: Record<string, number>;
154
+ };
155
+ /** Composite mean on the held-out set, winner run. */
156
+ winner: {
157
+ compositeMean: number;
158
+ perScenario: Record<string, number>;
159
+ surface: MutableSurface;
160
+ };
161
+ /** `winner.compositeMean - baseline.compositeMean`. Positive means the
162
+ * winner scored higher than the baseline on the held-out set. */
163
+ lift: number;
164
+ /** `decision` of the gate result — from `defaultProductionGate` unless a
+ * custom `gate` was passed. */
165
+ gateDecision: 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling';
166
+ /** Number of generations actually explored (may be less than the
167
+ * budget if the driver gave up early). Equals `raw.generations.length`. */
168
+ generationsExplored: number;
169
+ /** Wall-clock total. */
170
+ durationMs: number;
171
+ /** Total cost across the baseline campaign + every candidate surface of
+ * every generation. NOTE(review): the held-out baseline/winner
+ * evaluations do not appear to be part of this sum — confirm. */
172
+ totalCostUsd: number;
173
+ /**
174
+ * Raw substrate result for advanced inspection — full per-generation
175
+ * candidates, full campaign artifacts, all judge scores. Useful for
176
+ * debugging or reporting beyond the summary.
177
+ */
178
+ raw: RunImprovementLoopResult<TArtifact, TScenario>;
179
+ }
180
+ /**
181
+ * One-shot self-improvement loop. See module docstring for defaults +
182
+ * extension points.
183
+ *
+ * @returns Summary of the run: held-out baseline and winner scores, lift,
+ * gate decision, cost and duration, plus the raw loop result.
+ * @throws Error when the train split or the holdout split ends up empty.
+ *
184
+ * @example Minimum (LAND tier):
185
+ *
186
+ * const result = await selfImprove({
187
+ * agent: (surface, scenario, ctx) => myAgent(surface, scenario, ctx.signal),
188
+ * scenarios,
189
+ * judge,
190
+ * baselineSurface: DEFAULT_PROMPT,
191
+ * })
192
+ * console.log(`lift: ${result.lift.toFixed(3)} (${result.gateDecision})`)
193
+ *
194
+ * @example Distributed (workers in three regions):
195
+ *
196
+ * await selfImprove({
197
+ * agent: httpDispatch({ resolveUrl: ({ placement }) => REGION_URLS[placement!] }),
198
+ * scenarios,
199
+ * judge,
200
+ * baselineSurface: DEFAULT_PROMPT,
201
+ * cellPlacement: ({ scenario }) => scenario.region,
202
+ * budget: { maxConcurrency: 12 },
203
+ * })
204
+ */
205
+ declare function selfImprove<TScenario extends Scenario, TArtifact>(opts: SelfImproveOptions<TScenario, TArtifact>): Promise<SelfImproveResult<TScenario, TArtifact>>;
206
+
207
+ export { CampaignStorage, DispatchContext, Gate, ImprovementDriver, JudgeConfig, MutableSurface, RunImprovementLoopResult, Scenario, type SelfImproveBudget, type SelfImproveLlm, type SelfImproveOptions, type SelfImproveProgressEvent, type SelfImproveResult, selfImprove };
@@ -6,12 +6,12 @@ import {
6
6
  heldOutGate,
7
7
  runEval,
8
8
  runImprovementLoop
9
- } from "../chunk-H5BGRSN4.js";
9
+ } from "../chunk-HRKOCLQA.js";
10
10
  import {
11
11
  fsCampaignStorage,
12
12
  inMemoryCampaignStorage,
13
13
  runCampaign
14
- } from "../chunk-RXK7FXLV.js";
14
+ } from "../chunk-J3EIOI3O.js";
15
15
  import "../chunk-N4SBKEPJ.js";
16
16
  import "../chunk-YV7J7X5N.js";
17
17
  import {
@@ -24,6 +24,130 @@ import "../chunk-VXNVVBZO.js";
24
24
  import "../chunk-PC4UYEBM.js";
25
25
  import "../chunk-QYJT52YW.js";
26
26
  import "../chunk-NSBPE2FW.js";
27
+
28
+ // src/contract/self-improve.ts
29
/**
 * Split scenarios into a holdout set and a train set, ordered by a 32-bit
 * FNV-1a-style hash of each scenario's `id`.
 *
 * The first `nHoldout` scenarios in hash order form the holdout set, where
 * `nHoldout = round(scenarios.length * fraction)` clamped to at least 1 and
 * at most `scenarios.length - 1`. Ids whose hashes collide are ordered by
 * comparing the ids themselves, so the result does not depend on the order
 * of the input array.
 *
 * @param scenarios Scenarios to split; each needs a string `id`. Not mutated.
 * @param fraction Share of the scenarios to hold out.
 * @returns `{ holdout, train }` — together they contain every input scenario
 *   exactly once. With fewer than two scenarios `train` is empty.
 */
function splitTrainHoldout(scenarios, fraction) {
  function hash(s) {
    let h = 2166136261 >>> 0;
    for (let i = 0; i < s.length; i++) {
      h ^= s.charCodeAt(i);
      h = Math.imul(h, 16777619) >>> 0;
    }
    return h;
  }
  // Hash every id once instead of on each comparison made by the sort.
  const keyed = scenarios.map((scenario) => ({ scenario, key: hash(scenario.id) }));
  keyed.sort((a, b) => {
    if (a.key !== b.key) return a.key - b.key;
    // Hash collision: fall back to the id so input order cannot leak into the split.
    if (a.scenario.id < b.scenario.id) return -1;
    if (a.scenario.id > b.scenario.id) return 1;
    return 0;
  });
  const sorted = keyed.map((entry) => entry.scenario);
  const nHoldout = Math.max(1, Math.min(sorted.length - 1, Math.round(sorted.length * fraction)));
  return {
    holdout: sorted.slice(0, nHoldout),
    train: sorted.slice(nHoldout)
  };
}
45
/**
 * Collapse per-scenario aggregates into a per-scenario score map and one
 * overall mean.
 *
 * @param byScenario Map from scenario id to an aggregate carrying a numeric
 *   `meanComposite`.
 * @returns `perScenario` holds every scenario's `meanComposite` as given.
 *   `compositeMean` is the unweighted mean of the finite values only, or 0
 *   when there are none, so one NaN/Infinity scenario cannot turn the
 *   overall mean (and any lift derived from it) into NaN.
 */
function meanComposite(byScenario) {
  const entries = Object.entries(byScenario);
  const perScenario = Object.fromEntries(entries.map(([id, agg]) => [id, agg.meanComposite]));
  const finite = entries
    .map(([, agg]) => agg.meanComposite)
    .filter((value) => Number.isFinite(value));
  const total = finite.reduce((sum, value) => sum + value, 0);
  return {
    compositeMean: finite.length === 0 ? 0 : total / finite.length,
    perScenario
  };
}
57
// Mutation hints handed to the default `gepaDriver` when the caller supplies
// neither `driver` nor `mutationPrimitives`.
var DEFAULT_MUTATION_PRIMITIVES = [
  "Tighten the hook: lead with the specific user outcome.",
  "Replace generic adjectives with specific verbs or proof numbers.",
  "Anchor every claim in something the scenario's brief literally supports.",
  "Honor the surface-shape constraint (length, register, audience vocabulary)."
];

/**
 * One-shot self-improvement run: splits the scenarios into train and holdout,
 * builds a default driver, gate and storage where none were passed, calls
 * `runImprovementLoop`, and condenses its result into a summary.
 *
 * Progress events: `baseline.started` is sent before the loop is invoked;
 * `baseline.completed` and `gate.decided` are both sent after the loop has
 * returned. No other event kinds are sent from this function.
 *
 * @param opts Agent, scenarios, judge and baseline surface, plus optional
 *   budget, driver, gate, llm, storage, placement and progress settings.
 * @returns Held-out baseline and winner scores, their difference (`lift`),
 *   the gate decision, generation count, wall-clock duration, summed cost and
 *   the raw loop result.
 * @throws Error when `budget.holdoutFraction` is NaN (and no explicit holdout
 *   is given), or when the train or holdout split is empty.
 */
async function selfImprove(opts) {
  const startedAt = Date.now();
  const budget = opts.budget ?? {};
  const generations = budget.generations ?? 3;
  const populationSize = budget.populationSize ?? 2;
  const maxConcurrency = budget.maxConcurrency ?? 2;
  const holdoutFraction = budget.holdoutFraction ?? 0.25;
  const costCeiling = budget.dollars;
  const explicitHoldout = budget.holdoutScenarios;

  let train;
  let holdout;
  if (explicitHoldout) {
    // Set lookup keeps the exclusion linear in the number of scenarios.
    const holdoutIds = new Set(explicitHoldout.map((s) => s.id));
    train = opts.scenarios.filter((s) => !holdoutIds.has(s.id));
    holdout = explicitHoldout;
  } else {
    // A NaN fraction would otherwise surface as the unrelated "holdout split is empty" error.
    if (Number.isNaN(Number(holdoutFraction))) {
      throw new Error("selfImprove: budget.holdoutFraction must be a number.");
    }
    ({ train, holdout } = splitTrainHoldout(opts.scenarios, holdoutFraction));
  }
  if (train.length === 0) {
    throw new Error("selfImprove: train split is empty. Reduce holdoutFraction or pass more scenarios.");
  }
  if (holdout.length === 0) {
    throw new Error("selfImprove: holdout split is empty. Pass more scenarios.");
  }

  const driver = opts.driver ?? gepaDriver({
    llm: {
      baseUrl: opts.llm?.baseUrl ?? "https://router.tangle.tools/v1",
      apiKey: opts.llm?.apiKey ?? process.env.OPENAI_API_KEY ?? ""
    },
    model: opts.llm?.model ?? "anthropic/claude-sonnet-4.6",
    target: opts.driverTarget ?? "agent surface (system prompt or config) being optimized by selfImprove",
    // Copy the shared default so a driver that edits its list cannot affect later calls.
    mutationPrimitives: opts.mutationPrimitives ?? [...DEFAULT_MUTATION_PRIMITIVES]
  });
  const gate = opts.gate ?? defaultProductionGate({
    holdoutScenarios: holdout,
    deltaThreshold: 0.05
  });
  const storage = opts.storage ?? inMemoryCampaignStorage();
  const runDir = opts.runDir ?? `mem://selfImprove-${startedAt}`;

  if (opts.onProgress) {
    opts.onProgress({ kind: "baseline.started", scenarios: opts.scenarios.length });
  }

  const result = await runImprovementLoop({
    scenarios: train,
    baselineSurface: opts.baselineSurface,
    dispatchWithSurface: opts.agent,
    driver,
    judges: [opts.judge],
    populationSize,
    maxGenerations: generations,
    holdoutScenarios: holdout,
    gate,
    autoOnPromote: opts.autoOnPromote ?? "none",
    ghOwner: opts.ghOwner,
    ghRepo: opts.ghRepo,
    storage,
    runDir,
    maxConcurrency,
    cellPlacement: opts.cellPlacement,
    costCeiling
  });

  // Both summaries come from the held-out evaluations, so `lift` compares like with like.
  const baseline = meanComposite(result.baselineOnHoldout.aggregates.byScenario);
  const winnerStats = meanComposite(result.winnerOnHoldout.aggregates.byScenario);
  const lift = winnerStats.compositeMean - baseline.compositeMean;

  if (opts.onProgress) {
    opts.onProgress({
      kind: "baseline.completed",
      compositeMean: baseline.compositeMean,
      durationMs: Date.now() - startedAt
    });
    opts.onProgress({
      kind: "gate.decided",
      decision: result.gateResult.decision,
      lift
    });
  }

  // Baseline campaign plus every candidate surface of every generation.
  // NOTE(review): the held-out baseline/winner evaluations are not added here — confirm that is intended.
  const totalCost = result.baselineCampaign.aggregates.totalCostUsd + result.generations.reduce(
    (sum, gen) => sum + gen.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0),
    0
  );

  return {
    baseline,
    winner: {
      ...winnerStats,
      surface: result.winnerSurface
    },
    lift,
    gateDecision: result.gateResult.decision,
    generationsExplored: result.generations.length,
    durationMs: Date.now() - startedAt,
    totalCostUsd: totalCost,
    raw: result
  };
}
27
151
  export {
28
152
  FileSystemOutcomeStore,
29
153
  InMemoryOutcomeStore,
@@ -36,6 +160,7 @@ export {
36
160
  inMemoryCampaignStorage,
37
161
  runCampaign,
38
162
  runEval,
39
- runImprovementLoop
163
+ runImprovementLoop,
164
+ selfImprove
40
165
  };
41
166
  //# sourceMappingURL=index.js.map
@@ -1 +1 @@
1
- {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
1
+ {"version":3,"sources":["../../src/contract/self-improve.ts"],"sourcesContent":["/**\n * # `selfImprove()` — the LAND-tier one-shot.\n *\n * The cheapest possible call site to run a real closed-loop self-\n * improvement over your agent. Wraps `runImprovementLoop` with smart\n * defaults and a budget-shaped options API; every escape hatch the\n * substrate exposes is reachable from here without losing the\n * one-function feel.\n *\n * Defaults picked to match the LAND-tier story:\n * - In-memory storage (no filesystem touch).\n * - `gepaDriver` reflective mutation with copywriting-flavored primitives\n * (override `driver` or `mutationPrimitives` for any domain).\n * - `defaultProductionGate` with `deltaThreshold: 0.05`.\n * - Held-out split = 25% of scenarios, deterministic by id hash.\n * - 3 generations × population 2 (raise via `budget` for more search).\n * - `autoOnPromote: 'none'` (we don't open PRs unless you ask).\n *\n * Want one-click? Provide `agent` + `scenarios` + `judge`. Done.\n * Want distributed? Pass `cellPlacement` + an `httpDispatch`-backed\n * agent. Want a code-tier surface? Pass a `MutableSurface` + your own\n * `driver`. Same function.\n */\n\nimport { runImprovementLoop, type RunImprovementLoopResult } from '../campaign/presets/run-improvement-loop'\nimport { gepaDriver } from '../campaign/drivers/gepa'\nimport { defaultProductionGate } from '../campaign/gates/default-production-gate'\nimport { type CampaignStorage, inMemoryCampaignStorage } from '../campaign/storage'\nimport type {\n DispatchContext,\n Gate,\n ImprovementDriver,\n JudgeConfig,\n MutableSurface,\n Scenario,\n} from '../campaign/types'\n\nexport interface SelfImproveBudget {\n /** Hard $ ceiling across all cells in baseline + every generation. Cells\n * beyond the ceiling are skipped (cost-aware, not aborted). */\n dollars?: number\n /** How many improvement generations to explore. Default 3. 
Set 0 to\n * skip improvement entirely (selfImprove becomes a baseline-only run). */\n generations?: number\n /** Candidates the driver proposes per generation. Default 2. */\n populationSize?: number\n /** Max concurrent cells across the loop. Default 2. */\n maxConcurrency?: number\n /** Fraction of `scenarios` held out from training, used for the gate.\n * Default 0.25. Ignored when `holdoutScenarios` is set explicitly. */\n holdoutFraction?: number\n /** Explicit held-out scenarios; overrides `holdoutFraction`. */\n holdoutScenarios?: Scenario[]\n}\n\nexport interface SelfImproveLlm {\n /** Endpoint base URL. Default Tangle Router. */\n baseUrl?: string\n /** Bearer token. Default `process.env.OPENAI_API_KEY`. */\n apiKey?: string\n /** Model id used by `gepaDriver` reflection. Default\n * `anthropic/claude-sonnet-4.6`. */\n model?: string\n}\n\nexport type SelfImproveProgressEvent =\n | { kind: 'baseline.started'; scenarios: number }\n | { kind: 'baseline.completed'; compositeMean: number; durationMs: number }\n | { kind: 'generation.started'; index: number; populationSize: number }\n | { kind: 'generation.completed'; index: number; bestComposite: number; durationMs: number }\n | { kind: 'gate.decided'; decision: string; lift: number }\n\nexport interface SelfImproveOptions<TScenario extends Scenario, TArtifact> {\n /**\n * Your agent — a function that takes the current `MutableSurface`\n * (typically a system prompt the loop is optimizing) plus the\n * scenario + cell ctx, and returns the artifact your judge scores.\n *\n * Same shape as `RunOptimizationOptions.dispatchWithSurface`. 
Wrap a\n * plain `Dispatch` if you don't have a surface seam:\n *\n * agent: (_surface, scenario, ctx) => yourPlainDispatch(scenario, ctx)\n *\n * That mode evaluates without mutating any surface — useful as a\n * baseline-only run (set `budget.generations = 0`).\n */\n agent: (\n surface: MutableSurface,\n scenario: TScenario,\n ctx: DispatchContext,\n ) => Promise<TArtifact>\n\n /** Scenarios to evaluate against. Train/holdout split is computed from\n * these unless `budget.holdoutScenarios` is set explicitly. */\n scenarios: TScenario[]\n\n /** Judge that scores artifacts. Bring your own; use `langchainJudge`\n * from `/adapters/langchain` for a Runnable-shaped one. */\n judge: JudgeConfig<TArtifact, TScenario>\n\n /** Starting surface — system prompt, JSON config, anything `MutableSurface`\n * accepts. The driver mutates this each generation. */\n baselineSurface: MutableSurface\n\n /** Budget + loop shape. All fields optional; defaults pick the LAND-tier\n * story. */\n budget?: SelfImproveBudget\n\n /** Custom driver. Default is `gepaDriver` configured from `llm` +\n * `mutationPrimitives`. */\n driver?: ImprovementDriver\n\n /** Default-driver overrides — used when `driver` is unset. */\n mutationPrimitives?: string[]\n driverTarget?: string\n\n /** Custom gate. Default is `defaultProductionGate` with\n * `deltaThreshold: 0.05` on the held-out split. */\n gate?: Gate<TArtifact, TScenario>\n\n /** LLM config consumed by the default `gepaDriver`. Ignored if you pass\n * your own `driver`. */\n llm?: SelfImproveLlm\n\n /** Storage backend. Default `inMemoryCampaignStorage()` — nothing\n * persists past the call. Pass `fsCampaignStorage()` to write to disk. */\n storage?: CampaignStorage\n\n /** Run directory (logical for in-memory storage, real path for fs).\n * Default `mem://selfImprove-<timestamp>`. 
*/\n runDir?: string\n\n /** Distributed-driver seam — same as `RunCampaignOptions.cellPlacement`.\n * Returns an opaque placement key the substrate forwards to your agent\n * as `ctx.placement`. Combined with `httpDispatch` from\n * `/adapters/http`, fans cells across regions. */\n cellPlacement?: (input: {\n scenario: TScenario\n rep: number\n generation?: number\n }) => string | undefined\n\n /** Streaming hook — fires on baseline + each generation + gate decision.\n * Consumer routes events wherever (UI, dashboard, logs). */\n onProgress?: (event: SelfImproveProgressEvent) => void\n\n /** Auto-promotion behavior on a ship decision. Default `'none'` — we\n * return the winner; you ship it however you ship. `'pr'` opens a\n * GitHub PR via `openAutoPr`; requires `ghOwner` + `ghRepo`. */\n autoOnPromote?: 'pr' | 'none'\n ghOwner?: string\n ghRepo?: string\n}\n\nexport interface SelfImproveResult<TScenario extends Scenario, TArtifact> {\n /** Composite mean across all scenarios, baseline run. */\n baseline: {\n compositeMean: number\n perScenario: Record<string, number>\n }\n /** Composite mean on the held-out set, winner run. */\n winner: {\n compositeMean: number\n perScenario: Record<string, number>\n surface: MutableSurface\n }\n /** `winner.compositeMean - baselineOnHoldout.compositeMean`. Positive\n * means the gate observed improvement. */\n lift: number\n /** `defaultProductionGate.decide()` result. */\n gateDecision: 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling'\n /** Number of generations actually explored (may be less than the\n * budget if the driver gave up early). */\n generationsExplored: number\n /** Wall-clock total. */\n durationMs: number\n /** Total cost across baseline + every generation. */\n totalCostUsd: number\n /**\n * Raw substrate result for advanced inspection — full per-generation\n * candidates, full campaign artifacts, all judge scores. 
Useful for\n * debugging or reporting beyond the summary.\n */\n raw: RunImprovementLoopResult<TArtifact, TScenario>\n}\n\n/**\n * Deterministic train/holdout split by a stable hash of `scenario.id`,\n * so the same scenario set always splits the same way across runs.\n */\nfunction splitTrainHoldout<TScenario extends Scenario>(\n scenarios: TScenario[],\n fraction: number,\n): { train: TScenario[]; holdout: TScenario[] } {\n // Stable fnv-1a-ish hash of the id for ordering.\n function hash(s: string): number {\n let h = 2166136261 >>> 0\n for (let i = 0; i < s.length; i++) {\n h ^= s.charCodeAt(i)\n h = Math.imul(h, 16777619) >>> 0\n }\n return h\n }\n const sorted = [...scenarios].sort((a, b) => hash(a.id) - hash(b.id))\n const nHoldout = Math.max(1, Math.min(sorted.length - 1, Math.round(sorted.length * fraction)))\n return {\n holdout: sorted.slice(0, nHoldout),\n train: sorted.slice(nHoldout),\n }\n}\n\nfunction meanComposite(\n byScenario: Record<string, { meanComposite: number }>,\n): { compositeMean: number; perScenario: Record<string, number> } {\n const perScenario: Record<string, number> = {}\n const values: number[] = []\n for (const [id, agg] of Object.entries(byScenario)) {\n perScenario[id] = agg.meanComposite\n values.push(agg.meanComposite)\n }\n return {\n compositeMean: values.length === 0 ? 0 : values.reduce((s, v) => s + v, 0) / values.length,\n perScenario,\n }\n}\n\nconst DEFAULT_MUTATION_PRIMITIVES = [\n 'Tighten the hook: lead with the specific user outcome.',\n 'Replace generic adjectives with specific verbs or proof numbers.',\n 'Anchor every claim in something the scenario\\'s brief literally supports.',\n 'Honor the surface-shape constraint (length, register, audience vocabulary).',\n]\n\n/**\n * One-shot self-improvement loop. 
See module docstring for defaults +\n * extension points.\n *\n * @example Minimum (LAND tier):\n *\n * const result = await selfImprove({\n * agent: (surface, scenario, ctx) => myAgent(surface, scenario, ctx.signal),\n * scenarios,\n * judge,\n * baselineSurface: DEFAULT_PROMPT,\n * })\n * console.log(`lift: ${result.lift.toFixed(3)} (${result.gateDecision})`)\n *\n * @example Distributed (workers in three regions):\n *\n * await selfImprove({\n * agent: httpDispatch({ resolveUrl: ({ placement }) => REGION_URLS[placement!] }),\n * scenarios,\n * judge,\n * baselineSurface: DEFAULT_PROMPT,\n * cellPlacement: ({ scenario }) => scenario.region,\n * budget: { maxConcurrency: 12 },\n * })\n */\nexport async function selfImprove<TScenario extends Scenario, TArtifact>(\n opts: SelfImproveOptions<TScenario, TArtifact>,\n): Promise<SelfImproveResult<TScenario, TArtifact>> {\n const startedAt = Date.now()\n\n const budget = opts.budget ?? {}\n const generations = budget.generations ?? 3\n const populationSize = budget.populationSize ?? 2\n const maxConcurrency = budget.maxConcurrency ?? 2\n const holdoutFraction = budget.holdoutFraction ?? 0.25\n const costCeiling = budget.dollars\n\n const explicitHoldout = budget.holdoutScenarios\n const { train, holdout } = explicitHoldout\n ? {\n train: opts.scenarios.filter((s) => !explicitHoldout.some((h) => h.id === s.id)),\n holdout: explicitHoldout as TScenario[],\n }\n : splitTrainHoldout(opts.scenarios, holdoutFraction)\n\n if (train.length === 0) {\n throw new Error('selfImprove: train split is empty. Reduce holdoutFraction or pass more scenarios.')\n }\n if (holdout.length === 0) {\n throw new Error('selfImprove: holdout split is empty. Pass more scenarios.')\n }\n\n const driver: ImprovementDriver =\n opts.driver ??\n gepaDriver({\n llm: {\n baseUrl: opts.llm?.baseUrl ?? 'https://router.tangle.tools/v1',\n apiKey: opts.llm?.apiKey ?? process.env.OPENAI_API_KEY ?? '',\n },\n model: opts.llm?.model ?? 
'anthropic/claude-sonnet-4.6',\n target: opts.driverTarget ?? 'agent surface (system prompt or config) being optimized by selfImprove',\n mutationPrimitives: opts.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES,\n })\n\n const gate: Gate<TArtifact, TScenario> =\n opts.gate ??\n defaultProductionGate<TArtifact, TScenario>({\n holdoutScenarios: holdout,\n deltaThreshold: 0.05,\n })\n\n const storage = opts.storage ?? inMemoryCampaignStorage()\n const runDir = opts.runDir ?? `mem://selfImprove-${startedAt}`\n\n if (opts.onProgress) {\n opts.onProgress({ kind: 'baseline.started', scenarios: opts.scenarios.length })\n }\n\n const result = await runImprovementLoop<TScenario, TArtifact>({\n scenarios: train,\n baselineSurface: opts.baselineSurface,\n dispatchWithSurface: opts.agent,\n driver,\n judges: [opts.judge],\n populationSize,\n maxGenerations: generations,\n holdoutScenarios: holdout,\n gate,\n autoOnPromote: opts.autoOnPromote ?? 'none',\n ghOwner: opts.ghOwner,\n ghRepo: opts.ghRepo,\n storage,\n runDir,\n maxConcurrency,\n cellPlacement: opts.cellPlacement,\n costCeiling,\n })\n\n const baseline = meanComposite(result.baselineOnHoldout.aggregates.byScenario)\n const winnerStats = meanComposite(result.winnerOnHoldout.aggregates.byScenario)\n\n if (opts.onProgress) {\n opts.onProgress({\n kind: 'baseline.completed',\n compositeMean: baseline.compositeMean,\n durationMs: Date.now() - startedAt,\n })\n opts.onProgress({\n kind: 'gate.decided',\n decision: result.gateResult.decision,\n lift: winnerStats.compositeMean - baseline.compositeMean,\n })\n }\n\n const totalCost =\n result.baselineCampaign.aggregates.totalCostUsd +\n result.generations.reduce(\n (sum, gen) => sum + gen.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0),\n 0,\n )\n\n return {\n baseline,\n winner: {\n ...winnerStats,\n surface: result.winnerSurface,\n },\n lift: winnerStats.compositeMean - baseline.compositeMean,\n gateDecision: result.gateResult.decision,\n 
generationsExplored: result.generations.length,\n durationMs: Date.now() - startedAt,\n totalCostUsd: totalCost,\n raw: result,\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;AA8LA,SAAS,kBACP,WACA,UAC8C;AAE9C,WAAS,KAAK,GAAmB;AAC/B,QAAI,IAAI,eAAe;AACvB,aAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,WAAK,EAAE,WAAW,CAAC;AACnB,UAAI,KAAK,KAAK,GAAG,QAAQ,MAAM;AAAA,IACjC;AACA,WAAO;AAAA,EACT;AACA,QAAM,SAAS,CAAC,GAAG,SAAS,EAAE,KAAK,CAAC,GAAG,MAAM,KAAK,EAAE,EAAE,IAAI,KAAK,EAAE,EAAE,CAAC;AACpE,QAAM,WAAW,KAAK,IAAI,GAAG,KAAK,IAAI,OAAO,SAAS,GAAG,KAAK,MAAM,OAAO,SAAS,QAAQ,CAAC,CAAC;AAC9F,SAAO;AAAA,IACL,SAAS,OAAO,MAAM,GAAG,QAAQ;AAAA,IACjC,OAAO,OAAO,MAAM,QAAQ;AAAA,EAC9B;AACF;AAEA,SAAS,cACP,YACgE;AAChE,QAAM,cAAsC,CAAC;AAC7C,QAAM,SAAmB,CAAC;AAC1B,aAAW,CAAC,IAAI,GAAG,KAAK,OAAO,QAAQ,UAAU,GAAG;AAClD,gBAAY,EAAE,IAAI,IAAI;AACtB,WAAO,KAAK,IAAI,aAAa;AAAA,EAC/B;AACA,SAAO;AAAA,IACL,eAAe,OAAO,WAAW,IAAI,IAAI,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,OAAO;AAAA,IACpF;AAAA,EACF;AACF;AAEA,IAAM,8BAA8B;AAAA,EAClC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AA2BA,eAAsB,YACpB,MACkD;AAClD,QAAM,YAAY,KAAK,IAAI;AAE3B,QAAM,SAAS,KAAK,UAAU,CAAC;AAC/B,QAAM,cAAc,OAAO,eAAe;AAC1C,QAAM,iBAAiB,OAAO,kBAAkB;AAChD,QAAM,iBAAiB,OAAO,kBAAkB;AAChD,QAAM,kBAAkB,OAAO,mBAAmB;AAClD,QAAM,cAAc,OAAO;AAE3B,QAAM,kBAAkB,OAAO;AAC/B,QAAM,EAAE,OAAO,QAAQ,IAAI,kBACvB;AAAA,IACE,OAAO,KAAK,UAAU,OAAO,CAAC,MAAM,CAAC,gBAAgB,KAAK,CAAC,MAAM,EAAE,OAAO,EAAE,EAAE,CAAC;AAAA,IAC/E,SAAS;AAAA,EACX,IACA,kBAAkB,KAAK,WAAW,eAAe;AAErD,MAAI,MAAM,WAAW,GAAG;AACtB,UAAM,IAAI,MAAM,mFAAmF;AAAA,EACrG;AACA,MAAI,QAAQ,WAAW,GAAG;AACxB,UAAM,IAAI,MAAM,2DAA2D;AAAA,EAC7E;AAEA,QAAM,SACJ,KAAK,UACL,WAAW;AAAA,IACT,KAAK;AAAA,MACH,SAAS,KAAK,KAAK,WAAW;AAAA,MAC9B,QAAQ,KAAK,KAAK,UAAU,QAAQ,IAAI,kBAAkB;AAAA,IAC5D;AAAA,IACA,OAAO,KAAK,KAAK,SAAS;AAAA,IAC1B,QAAQ,KAAK,gBAAgB;AAAA,IAC7B,oBAAoB,KAAK,sBAAsB;AAAA,EACjD,CAAC;AAEH,QAAM,OACJ,KAAK,QACL,sBAA4C;AAAA,IAC1C,kBAAkB;AAAA,IAClB,gBAAgB;AAAA,EAClB,CAAC;AAEH,QAAM,UAAU,KAAK,WAAW,wBAAwB;AACxD,QAAM,SAAS,KAAK,UAAU,qBAAqB,SAAS;AAE5D,MAAI,KAAK,YAAY;AACnB,S
AAK,WAAW,EAAE,MAAM,oBAAoB,WAAW,KAAK,UAAU,OAAO,CAAC;AAAA,EAChF;AAEA,QAAM,SAAS,MAAM,mBAAyC;AAAA,IAC5D,WAAW;AAAA,IACX,iBAAiB,KAAK;AAAA,IACtB,qBAAqB,KAAK;AAAA,IAC1B;AAAA,IACA,QAAQ,CAAC,KAAK,KAAK;AAAA,IACnB;AAAA,IACA,gBAAgB;AAAA,IAChB,kBAAkB;AAAA,IAClB;AAAA,IACA,eAAe,KAAK,iBAAiB;AAAA,IACrC,SAAS,KAAK;AAAA,IACd,QAAQ,KAAK;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA,eAAe,KAAK;AAAA,IACpB;AAAA,EACF,CAAC;AAED,QAAM,WAAW,cAAc,OAAO,kBAAkB,WAAW,UAAU;AAC7E,QAAM,cAAc,cAAc,OAAO,gBAAgB,WAAW,UAAU;AAE9E,MAAI,KAAK,YAAY;AACnB,SAAK,WAAW;AAAA,MACd,MAAM;AAAA,MACN,eAAe,SAAS;AAAA,MACxB,YAAY,KAAK,IAAI,IAAI;AAAA,IAC3B,CAAC;AACD,SAAK,WAAW;AAAA,MACd,MAAM;AAAA,MACN,UAAU,OAAO,WAAW;AAAA,MAC5B,MAAM,YAAY,gBAAgB,SAAS;AAAA,IAC7C,CAAC;AAAA,EACH;AAEA,QAAM,YACJ,OAAO,iBAAiB,WAAW,eACnC,OAAO,YAAY;AAAA,IACjB,CAAC,KAAK,QAAQ,MAAM,IAAI,SAAS,OAAO,CAAC,GAAG,OAAO,IAAI,GAAG,SAAS,WAAW,cAAc,CAAC;AAAA,IAC7F;AAAA,EACF;AAEF,SAAO;AAAA,IACL;AAAA,IACA,QAAQ;AAAA,MACN,GAAG;AAAA,MACH,SAAS,OAAO;AAAA,IAClB;AAAA,IACA,MAAM,YAAY,gBAAgB,SAAS;AAAA,IAC3C,cAAc,OAAO,WAAW;AAAA,IAChC,qBAAqB,OAAO,YAAY;AAAA,IACxC,YAAY,KAAK,IAAI,IAAI;AAAA,IACzB,cAAc;AAAA,IACd,KAAK;AAAA,EACP;AACF;","names":[]}
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.44.0",
5
+ "version": "0.45.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
package/dist/rl.d.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
2
- import { d as CampaignResult } from './types-DToGONFA.js';
2
+ import { d as CampaignResult } from './types-8u72Gc76.js';
3
3
  import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-CoJMs2Iz.js';
4
4
  export { r as runEvalCampaign } from './researcher-CoJMs2Iz.js';
5
5
  import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
@@ -1,10 +1,10 @@
1
1
  import {
2
2
  runCampaign
3
- } from "./chunk-RXK7FXLV.js";
3
+ } from "./chunk-J3EIOI3O.js";
4
4
  import "./chunk-WP7SY7AI.js";
5
5
  import "./chunk-QYJT52YW.js";
6
6
  import "./chunk-NSBPE2FW.js";
7
7
  export {
8
8
  runCampaign
9
9
  };
10
- //# sourceMappingURL=run-campaign-GNDO66B4.js.map
10
+ //# sourceMappingURL=run-campaign-6UEVBPP3.js.map
@@ -1,4 +1,4 @@
1
- import { S as Scenario, d as CampaignResult, j as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, J as JudgeConfig, L as LabeledScenarioStore, e as CampaignTraceWriter, M as MutableSurface, l as GenerationRecord } from './types-DToGONFA.js';
1
+ import { S as Scenario, d as CampaignResult, j as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, g as DispatchFn, J as JudgeConfig, L as LabeledScenarioStore, e as CampaignTraceWriter, M as MutableSurface, l as GenerationRecord } from './types-8u72Gc76.js';
2
2
  import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
3
3
  import { RunRecord } from '@tangle-network/agent-runtime';
4
4
  import { R as RedTeamCase } from './red-team-30II1T4o.js';
@@ -267,6 +267,22 @@ interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {
267
267
  * (Cloudflare Workers, Deno, edge) — the `CampaignResult` is still
268
268
  * produced; artifacts/traces just aren't persisted to disk. */
269
269
  storage?: CampaignStorage;
270
+ /**
271
+ * Optional per-cell placement strategy. Returns an opaque string the
272
+ * substrate forwards as `ctx.placement` to the Dispatch — placement-aware
273
+ * Dispatches (e.g. `httpDispatch` from `/adapters/http`) use it to route
274
+ * each cell to the right worker, region, or sandbox. When unset, every
275
+ * cell receives `ctx.placement = undefined` and behaves identically to
276
+ * the in-process case.
277
+ *
278
+ * @example
279
+ * cellPlacement: ({ scenario }) => scenario.tags?.includes('eu') ? 'eu-west' : 'us-east'
280
+ */
281
+ cellPlacement?: (input: {
282
+ scenario: TScenario;
283
+ rep: number;
284
+ generation?: number;
285
+ }) => string | undefined;
270
286
  }
271
287
  declare function runCampaign<TScenario extends Scenario, TArtifact>(opts: RunCampaignOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
272
288
 
@@ -398,4 +414,4 @@ interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extend
398
414
  }
399
415
  declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>;
400
416
 
401
- export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverOptions as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type RunEvalOptions as a, type RunImprovementLoopOptions as b, type RunImprovementLoopResult as c, composeGate as d, defaultProductionGate as e, evolutionaryDriver as f, fsCampaignStorage as g, gepaDriver as h, heldOutGate as i, inMemoryCampaignStorage as j, runEval as k, runImprovementLoop as l, type OpenAutoPrResult as m, type RunOptimizationOptions as n, type RunOptimizationResult as o, openAutoPr as p, runOptimization as q, runCampaign as r, surfaceHash as s };
417
+ export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverOptions as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunImprovementLoopResult as R, type RunCampaignOptions as a, type RunEvalOptions as b, type RunImprovementLoopOptions as c, composeGate as d, defaultProductionGate as e, evolutionaryDriver as f, fsCampaignStorage as g, gepaDriver as h, heldOutGate as i, inMemoryCampaignStorage as j, runEval as k, runImprovementLoop as l, type OpenAutoPrResult as m, type RunOptimizationOptions as n, type RunOptimizationResult as o, openAutoPr as p, runOptimization as q, runCampaign as r, surfaceHash as s };
@@ -40,6 +40,14 @@ interface DispatchContext {
40
40
  cycleId?: string;
41
41
  /** Populated when the substrate resumed from a prior cache hit. */
42
42
  resumedFrom?: string;
43
+ /**
44
+ * Opaque placement key supplied by `RunCampaignOptions.cellPlacement`.
45
+ * The substrate forwards it through unchanged; placement-aware Dispatch
46
+ * implementations (e.g. `httpDispatch` from `/adapters/http`) read it to
47
+ * route the cell to the right worker / region / sandbox. `undefined`
48
+ * when no placement strategy is configured.
49
+ */
50
+ placement?: string;
43
51
  }
44
52
  /** @experimental One function: scenario + ctx → artifact. Dispatcher chooses
45
53
  * whether to call `runMultishot`, `runLoop`, raw `streamPrompt`, anything. */
@@ -364,4 +372,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
364
372
  scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
365
373
  }
366
374
 
367
- export type { CampaignAggregates as C, DispatchFn as D, Gate as G, ImprovementDriver as I, JudgeConfig as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, CampaignArtifactWriter as a, CampaignCellResult as b, CampaignCostMeter as c, CampaignResult as d, CampaignTraceWriter as e, CodeSurface as f, DispatchContext as g, GateContext as h, GateDecision as i, GateResult as j, GenerationCandidate as k, GenerationRecord as l, JudgeDimension as m, JudgeScore as n, Mutator as o, SessionScript as p, LabeledScenarioWrite as q, LabeledScenarioSampleArgs as r, LabeledScenarioRecord as s, JudgeAggregate as t, LabeledScenarioSource as u, ScenarioAggregate as v };
375
+ export type { CampaignAggregates as C, DispatchContext as D, Gate as G, ImprovementDriver as I, JudgeConfig as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, CampaignArtifactWriter as a, CampaignCellResult as b, CampaignCostMeter as c, CampaignResult as d, CampaignTraceWriter as e, CodeSurface as f, DispatchFn as g, GateContext as h, GateDecision as i, GateResult as j, GenerationCandidate as k, GenerationRecord as l, JudgeDimension as m, JudgeScore as n, Mutator as o, SessionScript as p, LabeledScenarioWrite as q, LabeledScenarioSampleArgs as r, LabeledScenarioRecord as s, JudgeAggregate as t, LabeledScenarioSource as u, ScenarioAggregate as v };
@@ -0,0 +1,121 @@
1
+ # Composing agent-eval with your observability stack
2
+
3
+ `@tangle-network/agent-eval` ships its own OpenTelemetry pipeline
4
+ (`@tangle-network/agent-eval/telemetry`) that emits spans for every
5
+ cell, judge invocation, mutator proposal, and gate decision. **It's
6
+ just OTel** — same protocol as Langfuse SDK, OpenLLMetry, Arize
7
+ Phoenix, TraceAI, and the OpenTelemetry GenAI semantic conventions.
8
+
9
+ That means: if you already instrument your agent with any OTel-native
10
+ observability tool, the two compose **for free at the protocol layer**.
11
+ This doc shows the composition pattern; no agent-eval-specific adapter
12
+ code required.
13
+
14
+ ## TL;DR — one OTel context, two emitters
15
+
16
+ 1. Set up a shared OTel tracer provider in your process (or service mesh).
17
+ 2. Configure your observability tool (TraceAI / Langfuse / OpenLLMetry /
18
+ Phoenix) to register its instrumentations against that provider.
19
+ 3. Configure agent-eval's `/telemetry` exporter against the same provider.
20
+ 4. Run a campaign. Both sets of spans land at your OTel collector.
21
+ 5. Filter / route / fan-out at the collector layer — Jaeger, Tempo,
22
+ Phoenix, Langfuse cloud, your private collector, whatever.
23
+
24
+ The Tangle substrate doesn't compete with the observability tool;
25
+ they're orthogonal. The tool tells you *what your agent did*; the
26
+ substrate tells you *what the campaign / judge / mutator decided about
27
+ it*. Unified at the trace level, you see both as one timeline per cell.
28
+
29
+ ## Per-tool notes
30
+
31
+ ### TraceAI (Future-AGI)
32
+
33
+ - TS SDK auto-instruments OpenAI/Anthropic SDKs + LangChain.
34
+ - Compatible with the OpenTelemetry GenAI semantic conventions.
35
+ - Compose: register TraceAI's instrumentations on the global tracer
36
+ provider, then either point both at your OTLP collector or at
37
+ TraceAI's hosted backend if you want their UI.
38
+
39
+ ### Langfuse SDK
40
+
41
+ - Larger installed base; has its own hosted product + OSS self-host.
42
+ - Their OpenTelemetry-compatible mode ships LLM call spans with
43
+ Langfuse-specific attributes preserved.
44
+ - Compose: register Langfuse as an OTel processor; agent-eval's
45
+ campaign/judge/mutator spans appear alongside the LLM calls in their
46
+ UI.
47
+
48
+ ### OpenLLMetry (Traceloop)
49
+
50
+ - OSS auto-instrumentation library; OTel-native by design.
51
+ - Wide framework coverage (LangChain, LlamaIndex, Haystack, OpenAI,
52
+ Anthropic).
53
+ - Compose: set up Traceloop's exporter; agent-eval's exporter shares
54
+ the same trace context per cell.
55
+
56
+ ### Arize Phoenix
57
+
58
+ - OSS observability backend; strong in the eval-tooling community.
59
+ - OTel-native ingest; renders trace + span attributes per the GenAI
60
+ semantic conventions.
61
+ - Compose: point both exporters at your local Phoenix instance. Phoenix
62
+ becomes the unified UI for both LLM-call traces and campaign spans.
63
+
64
+ ## Wiring pattern (reference)
65
+
66
+ ```ts
67
+ import { trace } from '@opentelemetry/api'
68
+ import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node'
69
+ import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'
70
+ import { SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base'
71
+
72
+ // 1. One shared tracer provider for the process.
73
+ const provider = new NodeTracerProvider()
74
+ provider.addSpanProcessor(new SimpleSpanProcessor(
75
+ new OTLPTraceExporter({ url: 'http://localhost:4318/v1/traces' }),
76
+ ))
77
+ provider.register()
78
+
79
+ // 2. Your observability tool registers against the global provider.
80
+ // Example for TraceAI / OpenLLMetry / Langfuse — call their init.
81
+ // (See each tool's docs.)
82
+
83
+ // 3. agent-eval is already OTel-native; it picks up the same global
84
+ // provider. Just ensure `@tangle-network/agent-eval/telemetry` is
85
+ // initialized for the campaign:
86
+ import { setOtelExporter } from '@tangle-network/agent-eval/telemetry'
87
+ setOtelExporter({ kind: 'otel-global' }) // use the global provider
88
+
89
+ // 4. Run your campaign — both sets of spans land at the collector.
90
+ import { runEval } from '@tangle-network/agent-eval/contract'
91
+ await runEval({ /* ... */ })
92
+ ```
93
+
94
+ That's it. No new adapter shipping required — the libs are already
95
+ designed to live in the same OTel ecosystem.
96
+
97
+ ## When you'd want a deeper, code-level adapter
98
+
99
+ There are two cases where a thin adapter would add value beyond the
100
+ OTel-protocol composition:
101
+
102
+ 1. **Cost-aware judging.** Your observability tool's auto-instrumented
103
+ spans carry token counts + cost. A custom `JudgeConfig` can read
104
+ them via the OTel context and refuse to score artifacts that
105
+ exceeded a per-call budget. Easy to write yourself; we'll ship a
106
+ reference helper (`costAwareJudgeFromOtel`) when a partner pulls on
107
+ this.
108
+ 2. **Tool-aware judging.** Your instrumentation captures the tool-call
109
+ sequence (`langchain.tool.invoked`, `openai.function.called`, etc.).
110
+ A judge that scores "did the agent use the right tool" reads those
111
+ spans directly. Also straightforward; helper ships when needed.
112
+
113
+ Both of these are L1-tier ergonomic helpers; the underlying composition
114
+ works today without them.
115
+
116
+ ## What this does NOT install
117
+
118
+ No new dependencies. No new peer deps. No `@traceai/*`, no
119
+ `@langfuse/*`, no `@opentelemetry/*` in our manifest. You bring the
120
+ observability stack you want; agent-eval just emits OTel and respects
121
+ whatever provider is registered.
@@ -32,11 +32,11 @@ So adoption is *graduated*, and the builder picks the depth: (1) **trace-analysi
32
32
 
33
33
  | Tier | What they do | What they get | Billing |
34
34
  |---|---|---|---|
35
- | **LAND** (exists today) | `npm i @tangle-network/agent-eval`, wrap their agent behind one `dispatch` seam, bring a judge | Full self-improvement loop + **local** trace/eval artifacts. Any infra, no sandbox. | Free (lib) |
35
+ | **LAND** (exists today) | `npm i @tangle-network/agent-eval`, wrap their agent behind one `dispatch` seam, bring a judge | Full self-improvement loop + **local** trace/eval artifacts. Any infra, no sandbox. | Free (lib) — **with optional Tangle Router as a $0-friction inference upsell.** When a builder points `OPENAI_BASE_URL` at `router.tangle.tools/v1`, every campaign call (agent + judge + reflective mutation) routes through us; we earn the routing margin. Same code, opt-in monetization vector that ships today. |
36
36
  | **EXPAND** (the build) | Route trace/eval/labeled-scenario data to our orchestrator | Hosted dashboards, cross-run intelligence, the capture flywheel as a service | **Metered** — composes with existing sandbox Stripe + cost-ledger |
37
37
  | **PLATFORM** (the carrot) | Move execution into our sandbox (agent-dev-container) | Substrate + orchestrator data/intelligence pre-wired, batteries included | Sandbox usage |
38
38
 
39
- The free lib casts the widest possible net at near-zero cost (it's already published). Value capture is EXPAND: hosting their data/intelligence = a billable surface on the dimensions we already meter (ingested/retained volume, eval-campaign compute, loop runs, seats). "We don't host observability unless they route to us" is the *business model*, not a gap.
39
+ The free lib casts the widest possible net at near-zero cost (it's already published). LAND is **not actually zero-revenue**: pointing the loop at Tangle Router is a one-line config change with no other code differences, so we monetize inference for any LAND-tier adopter who opts in. The wedge ladder is therefore four steps: no-revenue install → router routing margin (LAND with router) → metered data hosting (EXPAND) → sandbox usage (PLATFORM). Each step is a one-line config change, never a rewrite. Value capture concentrates at EXPAND (hosting their data/intelligence is the biggest billable surface), but LAND-with-router is the immediate upsell available from day one.
40
40
 
41
41
  ## Plan & gates — land-first, validate, then build
42
42