@tangle-network/agent-eval 0.45.0 → 0.47.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,9 @@ import {
7
7
  runEval,
8
8
  runImprovementLoop
9
9
  } from "../chunk-HRKOCLQA.js";
10
+ import {
11
+ createHostedClient
12
+ } from "../chunk-ZQABFCVJ.js";
10
13
  import {
11
14
  fsCampaignStorage,
12
15
  inMemoryCampaignStorage,
@@ -24,6 +27,207 @@ import "../chunk-VXNVVBZO.js";
24
27
  import "../chunk-PC4UYEBM.js";
25
28
  import "../chunk-QYJT52YW.js";
26
29
  import "../chunk-NSBPE2FW.js";
30
+
31
+ // src/contract/self-improve.ts
32
/**
 * Deterministically partition scenarios into a train set and a holdout set.
 *
 * Scenarios are ordered by a 32-bit FNV-1a style hash of `scenario.id`
 * (computed over UTF-16 code units). The sort is stable, so scenarios whose
 * ids hash to the same value keep their input order. The first `nHoldout`
 * entries of that ordering become the holdout set, the rest the train set.
 *
 * `nHoldout` is `round(scenarios.length * fraction)`, capped at
 * `length - 1` and then floored at 1. Because the floor is applied last, a
 * single-scenario input yields that scenario as holdout and an empty train
 * set; the caller is expected to reject empty splits.
 *
 * The input array is not mutated.
 *
 * @param {Array<{id: string}>} scenarios - Scenarios to split.
 * @param {number} fraction - Desired share of scenarios to hold out.
 * @returns {{holdout: Array, train: Array}} The two partitions.
 */
function splitTrainHoldout(scenarios, fraction) {
  function hash(s) {
    let h = 2166136261 >>> 0;
    for (let i = 0; i < s.length; i++) {
      h ^= s.charCodeAt(i);
      h = Math.imul(h, 16777619) >>> 0;
    }
    return h;
  }
  // Hash every id exactly once up front; hashing inside the comparator would
  // redo the work on each of the O(n log n) comparisons.
  const sorted = Array.from(scenarios, (scenario) => ({ scenario, key: hash(scenario.id) }))
    .sort((a, b) => a.key - b.key)
    .map((entry) => entry.scenario);
  const nHoldout = Math.max(1, Math.min(sorted.length - 1, Math.round(sorted.length * fraction)));
  return {
    holdout: sorted.slice(0, nHoldout),
    train: sorted.slice(nHoldout)
  };
}
48
/**
 * Collapse per-scenario aggregates into a per-scenario score map and the
 * unweighted mean of those scores.
 *
 * @param {Object<string, {meanComposite: number}>} byScenario - Aggregates
 *   keyed by scenario id.
 * @returns {{compositeMean: number, perScenario: Object<string, number>}}
 *   `compositeMean` is 0 when `byScenario` has no entries.
 */
function meanComposite(byScenario) {
  const entries = Object.entries(byScenario);
  const perScenario = {};
  let total = 0;
  for (const [scenarioId, aggregate] of entries) {
    perScenario[scenarioId] = aggregate.meanComposite;
    total += aggregate.meanComposite;
  }
  const compositeMean = entries.length > 0 ? total / entries.length : 0;
  return { compositeMean, perScenario };
}
60
/**
 * Mutation hints handed to the default `gepaDriver` when the caller of
 * `selfImprove` supplies neither its own `driver` nor `mutationPrimitives`.
 * The wording is copywriting-oriented; callers in other domains should
 * override it.
 */
const DEFAULT_MUTATION_PRIMITIVES = [
  "Tighten the hook: lead with the specific user outcome.",
  "Replace generic adjectives with specific verbs or proof numbers.",
  "Anchor every claim in something the scenario's brief literally supports.",
  "Honor the surface-shape constraint (length, register, audience vocabulary)."
];
66
/**
 * One-shot self-improvement run: splits scenarios into train/holdout, wires
 * up default driver/gate/storage where the caller gave none, delegates to
 * `runImprovementLoop`, and condenses the result into a summary.
 *
 * Defaults applied here: 3 generations, population 2, max concurrency 2,
 * holdout fraction 0.25, `autoOnPromote: "none"`, in-memory storage, a
 * `mem://selfImprove-<startedAt>` run dir, and a `defaultProductionGate`
 * with `deltaThreshold: 0.05`.
 *
 * @param {object} opts
 * @param {Function} opts.agent - Forwarded as `dispatchWithSurface`.
 * @param {Array<{id: string}>} opts.scenarios - All scenarios; split into
 *   train/holdout unless `opts.budget.holdoutScenarios` is given.
 * @param {object} opts.judge - The single judge passed to the loop.
 * @param {*} opts.baselineSurface - Starting surface.
 * @param {object} [opts.budget] - `generations`, `populationSize`,
 *   `maxConcurrency`, `holdoutFraction`, `holdoutScenarios`, `dollars`.
 * @param {object} [opts.driver] - Replaces the default `gepaDriver`.
 * @param {object} [opts.llm] - `baseUrl` / `apiKey` / `model` for the default
 *   driver; `apiKey` falls back to `process.env.OPENAI_API_KEY`, then "".
 * @param {string[]} [opts.mutationPrimitives] - Default-driver override.
 * @param {string} [opts.driverTarget] - Default-driver override.
 * @param {object} [opts.gate] - Replaces the default production gate.
 * @param {object} [opts.storage] - Campaign storage backend.
 * @param {string} [opts.runDir] - Run directory.
 * @param {Function} [opts.cellPlacement] - Forwarded to the loop unchanged.
 * @param {Function} [opts.onProgress] - Receives progress events.
 * @param {string} [opts.autoOnPromote] - Forwarded; defaults to "none".
 * @param {string} [opts.ghOwner] - Forwarded to the loop.
 * @param {string} [opts.ghRepo] - Forwarded to the loop.
 * @param {object} [opts.hostedTenant] - When set, the finished run is shipped
 *   to the hosted ingest endpoint on a best-effort basis.
 * @param {object} [opts.hostedLabels] - Labels attached to the hosted event.
 * @returns {Promise<object>} Summary with `baseline`, `winner`, `lift`,
 *   `gateDecision`, `generationsExplored`, `durationMs`, `totalCostUsd`, and
 *   the raw loop result under `raw`.
 * @throws {Error} When the train split or the holdout split is empty.
 */
async function selfImprove(opts) {
  const startedAt = Date.now();
  const budget = opts.budget ?? {};
  const generations = budget.generations ?? 3;
  const populationSize = budget.populationSize ?? 2;
  const maxConcurrency = budget.maxConcurrency ?? 2;
  const holdoutFraction = budget.holdoutFraction ?? 0.25;
  const costCeiling = budget.dollars;
  const explicitHoldout = budget.holdoutScenarios;

  let train;
  let holdout;
  if (explicitHoldout) {
    // Index holdout ids once so filtering the scenarios is linear rather than
    // scanning the whole holdout list for every scenario.
    const holdoutIds = new Set(explicitHoldout.map((s) => s.id));
    train = opts.scenarios.filter((s) => !holdoutIds.has(s.id));
    holdout = explicitHoldout;
  } else {
    ({ train, holdout } = splitTrainHoldout(opts.scenarios, holdoutFraction));
  }

  if (train.length === 0) {
    throw new Error(
      "selfImprove: train split is empty. Reduce holdoutFraction or pass more scenarios."
    );
  }
  if (holdout.length === 0) {
    throw new Error("selfImprove: holdout split is empty. Pass more scenarios.");
  }

  const driver = opts.driver ?? gepaDriver({
    llm: {
      baseUrl: opts.llm?.baseUrl ?? "https://router.tangle.tools/v1",
      apiKey: opts.llm?.apiKey ?? process.env.OPENAI_API_KEY ?? ""
    },
    model: opts.llm?.model ?? "anthropic/claude-sonnet-4.6",
    target: opts.driverTarget ?? "agent surface (system prompt or config) being optimized by selfImprove",
    mutationPrimitives: opts.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES
  });
  const gate = opts.gate ?? defaultProductionGate({
    holdoutScenarios: holdout,
    deltaThreshold: 0.05
  });
  const storage = opts.storage ?? inMemoryCampaignStorage();
  const runDir = opts.runDir ?? `mem://selfImprove-${startedAt}`;

  if (opts.onProgress) {
    opts.onProgress({ kind: "baseline.started", scenarios: opts.scenarios.length });
  }

  const result = await runImprovementLoop({
    scenarios: train,
    baselineSurface: opts.baselineSurface,
    dispatchWithSurface: opts.agent,
    driver,
    judges: [opts.judge],
    populationSize,
    maxGenerations: generations,
    holdoutScenarios: holdout,
    gate,
    autoOnPromote: opts.autoOnPromote ?? "none",
    ghOwner: opts.ghOwner,
    ghRepo: opts.ghRepo,
    storage,
    runDir,
    maxConcurrency,
    cellPlacement: opts.cellPlacement,
    costCeiling
  });

  // Both headline numbers are measured on the holdout set.
  const baseline = meanComposite(result.baselineOnHoldout.aggregates.byScenario);
  const winnerStats = meanComposite(result.winnerOnHoldout.aggregates.byScenario);
  const lift = winnerStats.compositeMean - baseline.compositeMean;

  if (opts.onProgress) {
    // These fire only after the whole loop has returned, so the
    // "baseline.completed" duration covers the full run, not just the
    // baseline phase.
    opts.onProgress({
      kind: "baseline.completed",
      compositeMean: baseline.compositeMean,
      durationMs: Date.now() - startedAt
    });
    opts.onProgress({
      kind: "gate.decided",
      decision: result.gateResult.decision,
      lift
    });
  }

  // Cost of the baseline campaign plus every candidate surface's campaign in
  // every generation.
  const totalCost = result.baselineCampaign.aggregates.totalCostUsd + result.generations.reduce(
    (sum, gen) => sum + gen.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0),
    0
  );

  const summary = {
    baseline,
    winner: {
      ...winnerStats,
      surface: result.winnerSurface
    },
    lift,
    gateDecision: result.gateResult.decision,
    generationsExplored: result.generations.length,
    durationMs: Date.now() - startedAt,
    totalCostUsd: totalCost,
    raw: result
  };

  if (opts.hostedTenant) {
    // Hosted ingest is deliberately best-effort: a failure is logged and the
    // local summary is still returned.
    try {
      await shipEvalRunToHosted(opts.hostedTenant, opts, summary, result, runDir);
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
      console.warn(`[agent-eval] hosted ingest failed (continuing): ${msg}`);
    }
  }
  return summary;
}
165
/**
 * Build an eval-run event from a finished improvement loop and send it through
 * the hosted client.
 *
 * The event's `generations` list holds the baseline campaign at index 0,
 * followed by one snapshot per explored generation (index
 * `generationIndex + 1`) describing that generation's best surface.
 * Per-snapshot `durationMs` is always reported as 0; this function does not
 * derive per-generation timings.
 *
 * Errors from the client are not caught here; the caller decides whether a
 * failed ingest is fatal.
 *
 * @param {object} tenant - Passed to `createHostedClient`.
 * @param {object} opts - The `selfImprove` options; `baselineSurface` and
 *   `hostedLabels` are read.
 * @param {object} summary - The `selfImprove` summary; `gateDecision`, `lift`,
 *   `totalCostUsd` and `durationMs` are read.
 * @param {object} raw - Improvement-loop result; `baselineCampaign` and
 *   `generations` are read.
 * @param {string} runDir - Run directory, also used as the `runId` prefix.
 * @returns {Promise<void>}
 */
async function shipEvalRunToHosted(tenant, opts, summary, raw, runDir) {
  const client = createHostedClient(tenant);

  // Flatten one campaign into the wire snapshot: per-cell composite (mean over
  // that cell's judges), per-judge dimensions, and the campaign-level mean.
  function snapshotFromCampaign(index, surface, campaign, durationMs) {
    const cells = campaign.cells.map((cell) => {
      const judgeScores = Object.values(cell.judgeScores);
      const composite = judgeScores.length === 0 ? 0 : judgeScores.reduce((s, j) => s + j.composite, 0) / judgeScores.length;
      return {
        scenarioId: cell.scenarioId,
        rep: cell.rep,
        compositeMean: composite,
        dimensions: Object.fromEntries(
          Object.entries(cell.judgeScores).map(([name, score]) => [name, score.dimensions])
        ),
        errorMessage: cell.error ?? undefined
      };
    });
    const compositeMean = cells.length === 0 ? 0 : cells.reduce((s, c) => s + c.compositeMean, 0) / cells.length;
    // JSON.stringify yields undefined for values it cannot serialize (e.g. a
    // function); fall back to "" so hashing never receives a non-string.
    const serializedSurface = typeof surface === "string" ? surface : (JSON.stringify(surface ?? "") ?? "");
    return {
      index,
      surfaceHash: hashString(serializedSurface),
      surface,
      cells,
      compositeMean,
      costUsd: campaign.aggregates.totalCostUsd,
      durationMs
    };
  }

  // Best surface of a generation: highest average composite among surfaces
  // that actually executed cells (first one wins ties). Only when no surface
  // executed anything does it fall back to the first surface, so a surface
  // that ran nothing is never preferred over one that ran. Each surface's
  // average is computed once.
  function pickGenerationWinner(surfaces) {
    let best;
    let bestScore = -Infinity;
    for (const candidate of surfaces) {
      if (!(candidate.campaign.aggregates.cellsExecuted > 0)) continue;
      const score = averageComposite(candidate.campaign);
      if (best === undefined || score > bestScore) {
        best = candidate;
        bestScore = score;
      }
    }
    return best ?? surfaces[0];
  }

  const generations = [];
  generations.push(snapshotFromCampaign(0, opts.baselineSurface, raw.baselineCampaign, 0));
  for (const gen of raw.generations) {
    const winner = pickGenerationWinner(gen.surfaces);
    if (!winner) continue; // generation produced no surfaces
    generations.push(
      snapshotFromCampaign(gen.record.generationIndex + 1, winner.surface, winner.campaign, 0)
    );
  }

  const event = {
    runId: `${runDir}#${Date.now()}`,
    runDir,
    timestamp: new Date().toISOString(),
    status: "finished",
    labels: opts.hostedLabels ?? {},
    baseline: generations[0],
    generations,
    gateDecision: summary.gateDecision,
    holdoutLift: summary.lift,
    totalCostUsd: summary.totalCostUsd,
    totalDurationMs: summary.durationMs
  };
  await client.ingestEvalRun(event);
}
219
/**
 * Unweighted mean of the per-scenario `meanComposite` values of a campaign.
 *
 * @param {{aggregates: {byScenario: Object<string, {meanComposite: number}>}}} campaign
 * @returns {number} The mean, or 0 when the campaign has no scenario aggregates.
 */
function averageComposite(campaign) {
  const perScenario = Object.values(campaign.aggregates.byScenario);
  if (perScenario.length === 0) {
    return 0;
  }
  let total = 0;
  for (const aggregate of perScenario) {
    total += aggregate.meanComposite;
  }
  return total / perScenario.length;
}
223
/**
 * 32-bit FNV-1a style hash of a string, folded over its UTF-16 code units.
 *
 * @param {string} s - Text to hash.
 * @returns {string} The hash as lowercase hex, left-padded to 8 characters.
 */
function hashString(s) {
  const OFFSET_BASIS = 0x811c9dc5;
  const PRIME = 0x01000193;
  let state = OFFSET_BASIS;
  let index = 0;
  while (index < s.length) {
    // xor the code unit in, multiply with 32-bit wraparound, keep unsigned
    state = Math.imul(state ^ s.charCodeAt(index), PRIME) >>> 0;
    index += 1;
  }
  return state.toString(16).padStart(8, "0");
}
27
231
  export {
28
232
  FileSystemOutcomeStore,
29
233
  InMemoryOutcomeStore,
@@ -36,6 +240,7 @@ export {
36
240
  inMemoryCampaignStorage,
37
241
  runCampaign,
38
242
  runEval,
39
- runImprovementLoop
243
+ runImprovementLoop,
244
+ selfImprove
40
245
  };
41
246
  //# sourceMappingURL=index.js.map
@@ -1 +1 @@
1
- {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
1
+ {"version":3,"sources":["../../src/contract/self-improve.ts"],"sourcesContent":["/**\n * # `selfImprove()` — the LAND-tier one-shot.\n *\n * The cheapest possible call site to run a real closed-loop self-\n * improvement over your agent. Wraps `runImprovementLoop` with smart\n * defaults and a budget-shaped options API; every escape hatch the\n * substrate exposes is reachable from here without losing the\n * one-function feel.\n *\n * Defaults picked to match the LAND-tier story:\n * - In-memory storage (no filesystem touch).\n * - `gepaDriver` reflective mutation with copywriting-flavored primitives\n * (override `driver` or `mutationPrimitives` for any domain).\n * - `defaultProductionGate` with `deltaThreshold: 0.05`.\n * - Held-out split = 25% of scenarios, deterministic by id hash.\n * - 3 generations × population 2 (raise via `budget` for more search).\n * - `autoOnPromote: 'none'` (we don't open PRs unless you ask).\n *\n * Want one-click? Provide `agent` + `scenarios` + `judge`. Done.\n * Want distributed? Pass `cellPlacement` + an `httpDispatch`-backed\n * agent. Want a code-tier surface? Pass a `MutableSurface` + your own\n * `driver`. Same function.\n */\n\nimport { gepaDriver } from '../campaign/drivers/gepa'\nimport { defaultProductionGate } from '../campaign/gates/default-production-gate'\nimport {\n type RunImprovementLoopResult,\n runImprovementLoop,\n} from '../campaign/presets/run-improvement-loop'\nimport { type CampaignStorage, inMemoryCampaignStorage } from '../campaign/storage'\nimport type {\n DispatchContext,\n Gate,\n ImprovementDriver,\n JudgeConfig,\n MutableSurface,\n Scenario,\n} from '../campaign/types'\nimport { createHostedClient, type HostedTenant } from '../hosted/client'\nimport type {\n EvalRunCellScore,\n EvalRunEvent,\n EvalRunGenerationSnapshot,\n} from '../hosted/types'\n\nexport interface SelfImproveBudget {\n /** Hard $ ceiling across all cells in baseline + every generation. 
Cells\n * beyond the ceiling are skipped (cost-aware, not aborted). */\n dollars?: number\n /** How many improvement generations to explore. Default 3. Set 0 to\n * skip improvement entirely (selfImprove becomes a baseline-only run). */\n generations?: number\n /** Candidates the driver proposes per generation. Default 2. */\n populationSize?: number\n /** Max concurrent cells across the loop. Default 2. */\n maxConcurrency?: number\n /** Fraction of `scenarios` held out from training, used for the gate.\n * Default 0.25. Ignored when `holdoutScenarios` is set explicitly. */\n holdoutFraction?: number\n /** Explicit held-out scenarios; overrides `holdoutFraction`. */\n holdoutScenarios?: Scenario[]\n}\n\nexport interface SelfImproveLlm {\n /** Endpoint base URL. Default Tangle Router. */\n baseUrl?: string\n /** Bearer token. Default `process.env.OPENAI_API_KEY`. */\n apiKey?: string\n /** Model id used by `gepaDriver` reflection. Default\n * `anthropic/claude-sonnet-4.6`. */\n model?: string\n}\n\nexport type SelfImproveProgressEvent =\n | { kind: 'baseline.started'; scenarios: number }\n | { kind: 'baseline.completed'; compositeMean: number; durationMs: number }\n | { kind: 'generation.started'; index: number; populationSize: number }\n | { kind: 'generation.completed'; index: number; bestComposite: number; durationMs: number }\n | { kind: 'gate.decided'; decision: string; lift: number }\n\nexport interface SelfImproveOptions<TScenario extends Scenario, TArtifact> {\n /**\n * Your agent — a function that takes the current `MutableSurface`\n * (typically a system prompt the loop is optimizing) plus the\n * scenario + cell ctx, and returns the artifact your judge scores.\n *\n * Same shape as `RunOptimizationOptions.dispatchWithSurface`. 
Wrap a\n * plain `Dispatch` if you don't have a surface seam:\n *\n * agent: (_surface, scenario, ctx) => yourPlainDispatch(scenario, ctx)\n *\n * That mode evaluates without mutating any surface — useful as a\n * baseline-only run (set `budget.generations = 0`).\n */\n agent: (surface: MutableSurface, scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>\n\n /** Scenarios to evaluate against. Train/holdout split is computed from\n * these unless `budget.holdoutScenarios` is set explicitly. */\n scenarios: TScenario[]\n\n /** Judge that scores artifacts. Bring your own; use `langchainJudge`\n * from `/adapters/langchain` for a Runnable-shaped one. */\n judge: JudgeConfig<TArtifact, TScenario>\n\n /** Starting surface — system prompt, JSON config, anything `MutableSurface`\n * accepts. The driver mutates this each generation. */\n baselineSurface: MutableSurface\n\n /** Budget + loop shape. All fields optional; defaults pick the LAND-tier\n * story. */\n budget?: SelfImproveBudget\n\n /** Custom driver. Default is `gepaDriver` configured from `llm` +\n * `mutationPrimitives`. */\n driver?: ImprovementDriver\n\n /** Default-driver overrides — used when `driver` is unset. */\n mutationPrimitives?: string[]\n driverTarget?: string\n\n /** Custom gate. Default is `defaultProductionGate` with\n * `deltaThreshold: 0.05` on the held-out split. */\n gate?: Gate<TArtifact, TScenario>\n\n /** LLM config consumed by the default `gepaDriver`. Ignored if you pass\n * your own `driver`. */\n llm?: SelfImproveLlm\n\n /** Storage backend. Default `inMemoryCampaignStorage()` — nothing\n * persists past the call. Pass `fsCampaignStorage()` to write to disk. */\n storage?: CampaignStorage\n\n /** Run directory (logical for in-memory storage, real path for fs).\n * Default `mem://selfImprove-<timestamp>`. 
*/\n runDir?: string\n\n /** Distributed-driver seam — same as `RunCampaignOptions.cellPlacement`.\n * Returns an opaque placement key the substrate forwards to your agent\n * as `ctx.placement`. Combined with `httpDispatch` from\n * `/adapters/http`, fans cells across regions. */\n cellPlacement?: (input: {\n scenario: TScenario\n rep: number\n generation?: number\n }) => string | undefined\n\n /** Streaming hook — fires on baseline + each generation + gate decision.\n * Consumer routes events wherever (UI, dashboard, logs). */\n onProgress?: (event: SelfImproveProgressEvent) => void\n\n /** Auto-promotion behavior on a ship decision. Default `'none'` — we\n * return the winner; you ship it however you ship. `'pr'` opens a\n * GitHub PR via `openAutoPr`; requires `ghOwner` + `ghRepo`. */\n autoOnPromote?: 'pr' | 'none'\n ghOwner?: string\n ghRepo?: string\n\n /**\n * Opt-in: ship eval-run events to a hosted orchestrator (ours, your\n * self-hosted one, or any compatible implementation of the\n * `docs/hosted-ingest-spec.md` wire format). When set, the substrate\n * POSTs the final `EvalRunEvent` to `${endpoint}/v1/ingest/eval-runs`\n * after the loop completes. Failures are logged but do not fail the\n * loop — local result is always returned.\n *\n * For our orchestrator: `{ endpoint: 'https://orchestrator.tangle.tools/v1', apiKey, tenantId }`.\n *\n * For your self-hosted: any URL serving the wire format. See\n * `examples/hosted-ingest-server/` for the reference receiver.\n */\n hostedTenant?: HostedTenant\n\n /** Free-form labels attached to the hosted event (env, branch, model id,\n * etc.). Ignored when `hostedTenant` is unset. */\n hostedLabels?: Record<string, string>\n}\n\nexport interface SelfImproveResult<TScenario extends Scenario, TArtifact> {\n /** Composite mean across all scenarios, baseline run. */\n baseline: {\n compositeMean: number\n perScenario: Record<string, number>\n }\n /** Composite mean on the held-out set, winner run. 
*/\n winner: {\n compositeMean: number\n perScenario: Record<string, number>\n surface: MutableSurface\n }\n /** `winner.compositeMean - baselineOnHoldout.compositeMean`. Positive\n * means the gate observed improvement. */\n lift: number\n /** `defaultProductionGate.decide()` result. */\n gateDecision: 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling'\n /** Number of generations actually explored (may be less than the\n * budget if the driver gave up early). */\n generationsExplored: number\n /** Wall-clock total. */\n durationMs: number\n /** Total cost across baseline + every generation. */\n totalCostUsd: number\n /**\n * Raw substrate result for advanced inspection — full per-generation\n * candidates, full campaign artifacts, all judge scores. Useful for\n * debugging or reporting beyond the summary.\n */\n raw: RunImprovementLoopResult<TArtifact, TScenario>\n}\n\n/**\n * Deterministic train/holdout split by a stable hash of `scenario.id`,\n * so the same scenario set always splits the same way across runs.\n */\nfunction splitTrainHoldout<TScenario extends Scenario>(\n scenarios: TScenario[],\n fraction: number,\n): { train: TScenario[]; holdout: TScenario[] } {\n // Stable fnv-1a-ish hash of the id for ordering.\n function hash(s: string): number {\n let h = 2166136261 >>> 0\n for (let i = 0; i < s.length; i++) {\n h ^= s.charCodeAt(i)\n h = Math.imul(h, 16777619) >>> 0\n }\n return h\n }\n const sorted = [...scenarios].sort((a, b) => hash(a.id) - hash(b.id))\n const nHoldout = Math.max(1, Math.min(sorted.length - 1, Math.round(sorted.length * fraction)))\n return {\n holdout: sorted.slice(0, nHoldout),\n train: sorted.slice(nHoldout),\n }\n}\n\nfunction meanComposite(byScenario: Record<string, { meanComposite: number }>): {\n compositeMean: number\n perScenario: Record<string, number>\n} {\n const perScenario: Record<string, number> = {}\n const values: number[] = []\n for (const [id, agg] of Object.entries(byScenario)) {\n 
perScenario[id] = agg.meanComposite\n values.push(agg.meanComposite)\n }\n return {\n compositeMean: values.length === 0 ? 0 : values.reduce((s, v) => s + v, 0) / values.length,\n perScenario,\n }\n}\n\nconst DEFAULT_MUTATION_PRIMITIVES = [\n 'Tighten the hook: lead with the specific user outcome.',\n 'Replace generic adjectives with specific verbs or proof numbers.',\n \"Anchor every claim in something the scenario's brief literally supports.\",\n 'Honor the surface-shape constraint (length, register, audience vocabulary).',\n]\n\n/**\n * One-shot self-improvement loop. See module docstring for defaults +\n * extension points.\n *\n * @example Minimum (LAND tier):\n *\n * const result = await selfImprove({\n * agent: (surface, scenario, ctx) => myAgent(surface, scenario, ctx.signal),\n * scenarios,\n * judge,\n * baselineSurface: DEFAULT_PROMPT,\n * })\n * console.log(`lift: ${result.lift.toFixed(3)} (${result.gateDecision})`)\n *\n * @example Distributed (workers in three regions):\n *\n * await selfImprove({\n * agent: httpDispatch({ resolveUrl: ({ placement }) => REGION_URLS[placement!] }),\n * scenarios,\n * judge,\n * baselineSurface: DEFAULT_PROMPT,\n * cellPlacement: ({ scenario }) => scenario.region,\n * budget: { maxConcurrency: 12 },\n * })\n */\nexport async function selfImprove<TScenario extends Scenario, TArtifact>(\n opts: SelfImproveOptions<TScenario, TArtifact>,\n): Promise<SelfImproveResult<TScenario, TArtifact>> {\n const startedAt = Date.now()\n\n const budget = opts.budget ?? {}\n const generations = budget.generations ?? 3\n const populationSize = budget.populationSize ?? 2\n const maxConcurrency = budget.maxConcurrency ?? 2\n const holdoutFraction = budget.holdoutFraction ?? 0.25\n const costCeiling = budget.dollars\n\n const explicitHoldout = budget.holdoutScenarios\n const { train, holdout } = explicitHoldout\n ? 
{\n train: opts.scenarios.filter((s) => !explicitHoldout.some((h) => h.id === s.id)),\n holdout: explicitHoldout as TScenario[],\n }\n : splitTrainHoldout(opts.scenarios, holdoutFraction)\n\n if (train.length === 0) {\n throw new Error(\n 'selfImprove: train split is empty. Reduce holdoutFraction or pass more scenarios.',\n )\n }\n if (holdout.length === 0) {\n throw new Error('selfImprove: holdout split is empty. Pass more scenarios.')\n }\n\n const driver: ImprovementDriver =\n opts.driver ??\n gepaDriver({\n llm: {\n baseUrl: opts.llm?.baseUrl ?? 'https://router.tangle.tools/v1',\n apiKey: opts.llm?.apiKey ?? process.env.OPENAI_API_KEY ?? '',\n },\n model: opts.llm?.model ?? 'anthropic/claude-sonnet-4.6',\n target:\n opts.driverTarget ??\n 'agent surface (system prompt or config) being optimized by selfImprove',\n mutationPrimitives: opts.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES,\n })\n\n const gate: Gate<TArtifact, TScenario> =\n opts.gate ??\n defaultProductionGate<TArtifact, TScenario>({\n holdoutScenarios: holdout,\n deltaThreshold: 0.05,\n })\n\n const storage = opts.storage ?? inMemoryCampaignStorage()\n const runDir = opts.runDir ?? `mem://selfImprove-${startedAt}`\n\n if (opts.onProgress) {\n opts.onProgress({ kind: 'baseline.started', scenarios: opts.scenarios.length })\n }\n\n const result = await runImprovementLoop<TScenario, TArtifact>({\n scenarios: train,\n baselineSurface: opts.baselineSurface,\n dispatchWithSurface: opts.agent,\n driver,\n judges: [opts.judge],\n populationSize,\n maxGenerations: generations,\n holdoutScenarios: holdout,\n gate,\n autoOnPromote: opts.autoOnPromote ?? 
'none',\n ghOwner: opts.ghOwner,\n ghRepo: opts.ghRepo,\n storage,\n runDir,\n maxConcurrency,\n cellPlacement: opts.cellPlacement,\n costCeiling,\n })\n\n const baseline = meanComposite(result.baselineOnHoldout.aggregates.byScenario)\n const winnerStats = meanComposite(result.winnerOnHoldout.aggregates.byScenario)\n\n if (opts.onProgress) {\n opts.onProgress({\n kind: 'baseline.completed',\n compositeMean: baseline.compositeMean,\n durationMs: Date.now() - startedAt,\n })\n opts.onProgress({\n kind: 'gate.decided',\n decision: result.gateResult.decision,\n lift: winnerStats.compositeMean - baseline.compositeMean,\n })\n }\n\n const totalCost =\n result.baselineCampaign.aggregates.totalCostUsd +\n result.generations.reduce(\n (sum, gen) =>\n sum + gen.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0),\n 0,\n )\n\n const summary: SelfImproveResult<TScenario, TArtifact> = {\n baseline,\n winner: {\n ...winnerStats,\n surface: result.winnerSurface,\n },\n lift: winnerStats.compositeMean - baseline.compositeMean,\n gateDecision: result.gateResult.decision,\n generationsExplored: result.generations.length,\n durationMs: Date.now() - startedAt,\n totalCostUsd: totalCost,\n raw: result,\n }\n\n // Opt-in hosted ingest. Failures logged but never fail the loop — the\n // local result is always returned. This matches the wedge-doc invariant\n // that LAND-tier never blocks on EXPAND-tier infra.\n if (opts.hostedTenant) {\n try {\n await shipEvalRunToHosted(opts.hostedTenant, opts, summary, result, runDir)\n } catch (err) {\n const msg = err instanceof Error ? 
err.message : String(err)\n // eslint-disable-next-line no-console -- intentional: hosted-ingest is best-effort\n console.warn(`[agent-eval] hosted ingest failed (continuing): ${msg}`)\n }\n }\n\n return summary\n}\n\nasync function shipEvalRunToHosted<TScenario extends Scenario, TArtifact>(\n tenant: HostedTenant,\n opts: SelfImproveOptions<TScenario, TArtifact>,\n summary: SelfImproveResult<TScenario, TArtifact>,\n raw: RunImprovementLoopResult<TArtifact, TScenario>,\n runDir: string,\n): Promise<void> {\n const client = createHostedClient(tenant)\n\n function snapshotFromCampaign(\n index: number,\n surface: MutableSurface | undefined,\n campaign: RunImprovementLoopResult<TArtifact, TScenario>['baselineCampaign'],\n durationMs: number,\n ): EvalRunGenerationSnapshot {\n const cells: EvalRunCellScore[] = campaign.cells.map((cell) => {\n const judgeScores = Object.values(cell.judgeScores)\n const composite =\n judgeScores.length === 0\n ? 0\n : judgeScores.reduce((s, j) => s + j.composite, 0) / judgeScores.length\n return {\n scenarioId: cell.scenarioId,\n rep: cell.rep,\n compositeMean: composite,\n dimensions: Object.fromEntries(\n Object.entries(cell.judgeScores).map(([name, score]) => [name, score.dimensions]),\n ),\n errorMessage: cell.error ?? undefined,\n }\n })\n const compositeMean =\n cells.length === 0 ? 0 : cells.reduce((s, c) => s + c.compositeMean, 0) / cells.length\n return {\n index,\n surfaceHash: typeof surface === 'string' ? hashString(surface) : hashString(JSON.stringify(surface ?? '')),\n surface,\n cells,\n compositeMean,\n costUsd: campaign.aggregates.totalCostUsd,\n durationMs,\n }\n }\n\n const generations: EvalRunGenerationSnapshot[] = []\n // Baseline as generation 0.\n generations.push(snapshotFromCampaign(0, opts.baselineSurface, raw.baselineCampaign, 0))\n // Improvement generations as 1..N. 
Substrate stores per-surface campaigns\n // per generation — we summarize the WINNING surface per generation here.\n for (const gen of raw.generations) {\n const winner = gen.surfaces.reduce((best, s) =>\n s.campaign.aggregates.cellsExecuted > 0 &&\n (best === undefined || averageComposite(s.campaign) > averageComposite(best.campaign))\n ? s\n : best,\n gen.surfaces[0],\n )\n if (!winner) continue\n generations.push(\n snapshotFromCampaign(gen.record.generationIndex + 1, winner.surface, winner.campaign, 0),\n )\n }\n\n const event: EvalRunEvent = {\n runId: `${runDir}#${Date.now()}`,\n runDir,\n timestamp: new Date().toISOString(),\n status: 'finished',\n labels: opts.hostedLabels ?? {},\n baseline: generations[0],\n generations,\n gateDecision: summary.gateDecision,\n holdoutLift: summary.lift,\n totalCostUsd: summary.totalCostUsd,\n totalDurationMs: summary.durationMs,\n }\n\n await client.ingestEvalRun(event)\n}\n\nfunction averageComposite(\n campaign: RunImprovementLoopResult<unknown, Scenario>['baselineCampaign'],\n): number {\n const aggs = Object.values(campaign.aggregates.byScenario)\n return aggs.length === 0 ? 
0 : aggs.reduce((s, a) => s + a.meanComposite, 0) / aggs.length\n}\n\nfunction hashString(s: string): string {\n let h = 2166136261 >>> 0\n for (let i = 0; i < s.length; i++) {\n h ^= s.charCodeAt(i)\n h = Math.imul(h, 16777619) >>> 0\n }\n return h.toString(16).padStart(8, '0')\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAsNA,SAAS,kBACP,WACA,UAC8C;AAE9C,WAAS,KAAK,GAAmB;AAC/B,QAAI,IAAI,eAAe;AACvB,aAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,WAAK,EAAE,WAAW,CAAC;AACnB,UAAI,KAAK,KAAK,GAAG,QAAQ,MAAM;AAAA,IACjC;AACA,WAAO;AAAA,EACT;AACA,QAAM,SAAS,CAAC,GAAG,SAAS,EAAE,KAAK,CAAC,GAAG,MAAM,KAAK,EAAE,EAAE,IAAI,KAAK,EAAE,EAAE,CAAC;AACpE,QAAM,WAAW,KAAK,IAAI,GAAG,KAAK,IAAI,OAAO,SAAS,GAAG,KAAK,MAAM,OAAO,SAAS,QAAQ,CAAC,CAAC;AAC9F,SAAO;AAAA,IACL,SAAS,OAAO,MAAM,GAAG,QAAQ;AAAA,IACjC,OAAO,OAAO,MAAM,QAAQ;AAAA,EAC9B;AACF;AAEA,SAAS,cAAc,YAGrB;AACA,QAAM,cAAsC,CAAC;AAC7C,QAAM,SAAmB,CAAC;AAC1B,aAAW,CAAC,IAAI,GAAG,KAAK,OAAO,QAAQ,UAAU,GAAG;AAClD,gBAAY,EAAE,IAAI,IAAI;AACtB,WAAO,KAAK,IAAI,aAAa;AAAA,EAC/B;AACA,SAAO;AAAA,IACL,eAAe,OAAO,WAAW,IAAI,IAAI,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,OAAO;AAAA,IACpF;AAAA,EACF;AACF;AAEA,IAAM,8BAA8B;AAAA,EAClC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AA2BA,eAAsB,YACpB,MACkD;AAClD,QAAM,YAAY,KAAK,IAAI;AAE3B,QAAM,SAAS,KAAK,UAAU,CAAC;AAC/B,QAAM,cAAc,OAAO,eAAe;AAC1C,QAAM,iBAAiB,OAAO,kBAAkB;AAChD,QAAM,iBAAiB,OAAO,kBAAkB;AAChD,QAAM,kBAAkB,OAAO,mBAAmB;AAClD,QAAM,cAAc,OAAO;AAE3B,QAAM,kBAAkB,OAAO;AAC/B,QAAM,EAAE,OAAO,QAAQ,IAAI,kBACvB;AAAA,IACE,OAAO,KAAK,UAAU,OAAO,CAAC,MAAM,CAAC,gBAAgB,KAAK,CAAC,MAAM,EAAE,OAAO,EAAE,EAAE,CAAC;AAAA,IAC/E,SAAS;AAAA,EACX,IACA,kBAAkB,KAAK,WAAW,eAAe;AAErD,MAAI,MAAM,WAAW,GAAG;AACtB,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,MAAI,QAAQ,WAAW,GAAG;AACxB,UAAM,IAAI,MAAM,2DAA2D;AAAA,EAC7E;AAEA,QAAM,SACJ,KAAK,UACL,WAAW;AAAA,IACT,KAAK;AAAA,MACH,SAAS,KAAK,KAAK,WAAW;AAAA,MAC9B,QAAQ,KAAK,KAAK,UAAU,QAAQ,IAAI,kBAAkB;AAAA,IAC5D;AAAA,IACA,OAAO,KAAK,KAAK,SAAS;AAAA,IAC1B,QACE,KAAK,gBACL;AAAA,IACF,oBAAoB,KAAK,sBAAsB;AAAA,EACjD,CAAC;AAEH,QAAM,OACJ,KAAK,QACL,sB
AA4C;AAAA,IAC1C,kBAAkB;AAAA,IAClB,gBAAgB;AAAA,EAClB,CAAC;AAEH,QAAM,UAAU,KAAK,WAAW,wBAAwB;AACxD,QAAM,SAAS,KAAK,UAAU,qBAAqB,SAAS;AAE5D,MAAI,KAAK,YAAY;AACnB,SAAK,WAAW,EAAE,MAAM,oBAAoB,WAAW,KAAK,UAAU,OAAO,CAAC;AAAA,EAChF;AAEA,QAAM,SAAS,MAAM,mBAAyC;AAAA,IAC5D,WAAW;AAAA,IACX,iBAAiB,KAAK;AAAA,IACtB,qBAAqB,KAAK;AAAA,IAC1B;AAAA,IACA,QAAQ,CAAC,KAAK,KAAK;AAAA,IACnB;AAAA,IACA,gBAAgB;AAAA,IAChB,kBAAkB;AAAA,IAClB;AAAA,IACA,eAAe,KAAK,iBAAiB;AAAA,IACrC,SAAS,KAAK;AAAA,IACd,QAAQ,KAAK;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA,eAAe,KAAK;AAAA,IACpB;AAAA,EACF,CAAC;AAED,QAAM,WAAW,cAAc,OAAO,kBAAkB,WAAW,UAAU;AAC7E,QAAM,cAAc,cAAc,OAAO,gBAAgB,WAAW,UAAU;AAE9E,MAAI,KAAK,YAAY;AACnB,SAAK,WAAW;AAAA,MACd,MAAM;AAAA,MACN,eAAe,SAAS;AAAA,MACxB,YAAY,KAAK,IAAI,IAAI;AAAA,IAC3B,CAAC;AACD,SAAK,WAAW;AAAA,MACd,MAAM;AAAA,MACN,UAAU,OAAO,WAAW;AAAA,MAC5B,MAAM,YAAY,gBAAgB,SAAS;AAAA,IAC7C,CAAC;AAAA,EACH;AAEA,QAAM,YACJ,OAAO,iBAAiB,WAAW,eACnC,OAAO,YAAY;AAAA,IACjB,CAAC,KAAK,QACJ,MAAM,IAAI,SAAS,OAAO,CAAC,GAAG,OAAO,IAAI,GAAG,SAAS,WAAW,cAAc,CAAC;AAAA,IACjF;AAAA,EACF;AAEF,QAAM,UAAmD;AAAA,IACvD;AAAA,IACA,QAAQ;AAAA,MACN,GAAG;AAAA,MACH,SAAS,OAAO;AAAA,IAClB;AAAA,IACA,MAAM,YAAY,gBAAgB,SAAS;AAAA,IAC3C,cAAc,OAAO,WAAW;AAAA,IAChC,qBAAqB,OAAO,YAAY;AAAA,IACxC,YAAY,KAAK,IAAI,IAAI;AAAA,IACzB,cAAc;AAAA,IACd,KAAK;AAAA,EACP;AAKA,MAAI,KAAK,cAAc;AACrB,QAAI;AACF,YAAM,oBAAoB,KAAK,cAAc,MAAM,SAAS,QAAQ,MAAM;AAAA,IAC5E,SAAS,KAAK;AACZ,YAAM,MAAM,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAE3D,cAAQ,KAAK,mDAAmD,GAAG,EAAE;AAAA,IACvE;AAAA,EACF;AAEA,SAAO;AACT;AAEA,eAAe,oBACb,QACA,MACA,SACA,KACA,QACe;AACf,QAAM,SAAS,mBAAmB,MAAM;AAExC,WAAS,qBACP,OACA,SACA,UACA,YAC2B;AAC3B,UAAM,QAA4B,SAAS,MAAM,IAAI,CAAC,SAAS;AAC7D,YAAM,cAAc,OAAO,OAAO,KAAK,WAAW;AAClD,YAAM,YACJ,YAAY,WAAW,IACnB,IACA,YAAY,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,YAAY;AACrE,aAAO;AAAA,QACL,YAAY,KAAK;AAAA,QACjB,KAAK,KAAK;AAAA,QACV,eAAe;AAAA,QACf,YAAY,OAAO;AAAA,UACjB,OAAO,QAAQ,KAAK,WAAW,EAAE,IAAI,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,MAAM,MAAM,UAAU,CAAC;AAAA,QAClF;AAAA,QACA,cAAc,KAAK,SAAS;AAAA,MAC9B;AAAA,IACF,CAAC;
AACD,UAAM,gBACJ,MAAM,WAAW,IAAI,IAAI,MAAM,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,eAAe,CAAC,IAAI,MAAM;AAClF,WAAO;AAAA,MACL;AAAA,MACA,aAAa,OAAO,YAAY,WAAW,WAAW,OAAO,IAAI,WAAW,KAAK,UAAU,WAAW,EAAE,CAAC;AAAA,MACzG;AAAA,MACA;AAAA,MACA;AAAA,MACA,SAAS,SAAS,WAAW;AAAA,MAC7B;AAAA,IACF;AAAA,EACF;AAEA,QAAM,cAA2C,CAAC;AAElD,cAAY,KAAK,qBAAqB,GAAG,KAAK,iBAAiB,IAAI,kBAAkB,CAAC,CAAC;AAGvF,aAAW,OAAO,IAAI,aAAa;AACjC,UAAM,SAAS,IAAI,SAAS;AAAA,MAAO,CAAC,MAAM,MACxC,EAAE,SAAS,WAAW,gBAAgB,MACrC,SAAS,UAAa,iBAAiB,EAAE,QAAQ,IAAI,iBAAiB,KAAK,QAAQ,KAChF,IACA;AAAA,MACJ,IAAI,SAAS,CAAC;AAAA,IAChB;AACA,QAAI,CAAC,OAAQ;AACb,gBAAY;AAAA,MACV,qBAAqB,IAAI,OAAO,kBAAkB,GAAG,OAAO,SAAS,OAAO,UAAU,CAAC;AAAA,IACzF;AAAA,EACF;AAEA,QAAM,QAAsB;AAAA,IAC1B,OAAO,GAAG,MAAM,IAAI,KAAK,IAAI,CAAC;AAAA,IAC9B;AAAA,IACA,YAAW,oBAAI,KAAK,GAAE,YAAY;AAAA,IAClC,QAAQ;AAAA,IACR,QAAQ,KAAK,gBAAgB,CAAC;AAAA,IAC9B,UAAU,YAAY,CAAC;AAAA,IACvB;AAAA,IACA,cAAc,QAAQ;AAAA,IACtB,aAAa,QAAQ;AAAA,IACrB,cAAc,QAAQ;AAAA,IACtB,iBAAiB,QAAQ;AAAA,EAC3B;AAEA,QAAM,OAAO,cAAc,KAAK;AAClC;AAEA,SAAS,iBACP,UACQ;AACR,QAAM,OAAO,OAAO,OAAO,SAAS,WAAW,UAAU;AACzD,SAAO,KAAK,WAAW,IAAI,IAAI,KAAK,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,eAAe,CAAC,IAAI,KAAK;AACtF;AAEA,SAAS,WAAW,GAAmB;AACrC,MAAI,IAAI,eAAe;AACvB,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,SAAK,EAAE,WAAW,CAAC;AACnB,QAAI,KAAK,KAAK,GAAG,QAAQ,MAAM;AAAA,EACjC;AACA,SAAO,EAAE,SAAS,EAAE,EAAE,SAAS,GAAG,GAAG;AACvC;","names":[]}
@@ -0,0 +1,192 @@
1
+ import { M as MutableSurface, i as GateDecision } from '../types-8u72Gc76.js';
2
+
3
+ /**
4
+ * # Hosted-tier wire format — the schema that EVERY orchestrator (ours,
5
+ * a partner's self-hosted one, a future open implementation) must accept.
6
+ *
7
+ * **Stability:** every type in this file is committed under semver. New
8
+ * minors only ADD optional fields. Breaking changes mean a major bump
9
+ * (`HostedWireVersion` literal increment).
10
+ *
11
+ * The wire format is two event streams in one transport:
12
+ *
13
+ * 1. **Eval-run events** (`POST /v1/ingest/eval-runs`). Posted when a
14
+ * campaign / improvement-loop completes (or per-generation if
15
+ * streaming). Carries the structured result + per-cell scores +
16
+ * surface diffs the orchestrator stores for the dashboard.
17
+ *
18
+ * 2. **Trace spans** (`POST /v1/ingest/traces`). Standard OTLP-shaped
19
+ * spans with a few additional attributes so the orchestrator can
20
+ * pivot from eval-run → underlying execution. Compatible with any
21
+ * OTel collector.
22
+ *
23
+ * Both endpoints are authenticated with a bearer token + a tenant id
24
+ * header. Tenants isolate everything downstream of ingest; no tenant
25
+ * ever sees another tenant's data.
26
+ */
27
+
28
+ declare const HOSTED_WIRE_VERSION: "2026-05-26.v1";
29
+ type HostedWireVersion = typeof HOSTED_WIRE_VERSION;
30
+ /** Every ingest request carries these. */
31
+ interface HostedIngestHeaders {
32
+ /** Bearer token. The orchestrator validates against the tenant key. */
33
+ authorization: `Bearer ${string}`;
34
+ /** Stable tenant id (the orchestrator-side primary key for the tenant). */
35
+ 'x-tangle-tenant-id': string;
36
+ /** Wire-version pin so the server can reject incompatible payloads. */
37
+ 'x-tangle-wire-version': HostedWireVersion;
38
+ /** Optional idempotency key for retry-safe ingest. */
39
+ 'idempotency-key'?: string;
40
+ }
41
+ /** Lifecycle stages of an eval-run as the substrate reports them. */
42
+ type EvalRunStatus = 'started' | 'baseline-complete' | 'generation-complete' | 'gate-decided' | 'finished' | 'errored';
43
+ interface EvalRunCellScore {
44
+ /** Stable scenario id from the consumer's scenario set. */
45
+ scenarioId: string;
46
+ /** Repetition index when reps > 1; 0 for the default. */
47
+ rep: number;
48
+ /** Composite score across all judges + dimensions for this cell. */
49
+ compositeMean: number;
50
+ /** Per-judge → per-dimension scores. Values are always numbers (the type admits no null); a judge that did not run presumably has no entry — confirm against the producer. */
51
+ dimensions: Record<string, Record<string, number>>;
52
+ /** Per-cell error message if the dispatch threw. Omitted (undefined) on success — the field is optional, not nullable. */
53
+ errorMessage?: string;
54
+ }
55
+ interface EvalRunGenerationSnapshot {
56
+ /** Generation index. 0 is baseline. */
57
+ index: number;
58
+ /** Candidate surface fingerprint (stable hash) — pivot key into the
59
+ * trace stream to fetch the underlying execution. */
60
+ surfaceHash: string;
61
+ /** The candidate surface itself. May be omitted to avoid PII when the
62
+ * consumer prefers not to ship verbatim prompts. */
63
+ surface?: MutableSurface;
64
+ /** Per-cell scores for this generation. */
65
+ cells: EvalRunCellScore[];
66
+ /** Aggregate composite mean across all cells in this generation. */
67
+ compositeMean: number;
68
+ /** Total $ spent across this generation. */
69
+ costUsd: number;
70
+ /** Wall-clock duration of this generation. */
71
+ durationMs: number;
72
+ }
73
+ /**
74
+ * The top-level eval-run event. By default, one ingest call per logical
75
+ * eval-run; when streaming, generations arrive incrementally via repeated calls
76
+ * with the same `runId`. The orchestrator deduplicates by `(runId, generation.index)`.
77
+ */
78
+ interface EvalRunEvent {
79
+ /** Stable run id (the substrate's `runId`). UUID or substrate-generated. */
80
+ runId: string;
81
+ /** Where this run executed — derived from `RunCampaignOptions.runDir`. */
82
+ runDir: string;
83
+ /** ISO-8601 timestamp at which the substrate recorded the event. */
84
+ timestamp: string;
85
+ /** Lifecycle stage this event represents. */
86
+ status: EvalRunStatus;
87
+ /** Free-form consumer tags (env, branch, model id, etc.). Searchable. */
88
+ labels: Record<string, string>;
89
+ /** Baseline campaign snapshot. Present once the run has reached the `baseline-complete` lifecycle stage; absent before that. */
90
+ baseline?: EvalRunGenerationSnapshot;
91
+ /** Per-generation snapshots. Streams in; orchestrator appends. */
92
+ generations: EvalRunGenerationSnapshot[];
93
+ /** Final gate decision. Present once the run has reached the `gate-decided` lifecycle stage; absent before that. */
94
+ gateDecision?: GateDecision;
95
+ /** Held-out lift = winner-on-holdout - baseline-on-holdout. */
96
+ holdoutLift?: number;
97
+ /** Total cost in USD across baseline + every generation. */
98
+ totalCostUsd: number;
99
+ /** Total wall-clock duration, in milliseconds. */
100
+ totalDurationMs: number;
101
+ /** Error message; present only when status === 'errored'. */
102
+ errorMessage?: string;
103
+ }
104
+ /**
105
+ * OTel-shape span with a few additional attributes for eval-run pivoting.
106
+ * Compatible with any OTLP collector — `name`, `traceId`, `spanId`,
107
+ * `startTimeUnixNano`, `endTimeUnixNano`, `attributes` are stock OTel.
108
+ */
109
+ interface TraceSpanEvent {
110
+ traceId: string;
111
+ spanId: string;
112
+ parentSpanId?: string;
113
+ name: string;
114
+ startTimeUnixNano: number;
115
+ endTimeUnixNano: number;
116
+ attributes: Record<string, string | number | boolean>;
117
+ events?: Array<{
118
+ timeUnixNano: number;
119
+ name: string;
120
+ attributes?: Record<string, string | number | boolean>;
121
+ }>;
122
+ status?: {
123
+ code: 'OK' | 'ERROR' | 'UNSET';
124
+ message?: string;
125
+ };
126
+ /** Pivot back into the eval-run stream. */
127
+ 'tangle.runId'?: string;
128
+ /** Pivot to the specific generation. */
129
+ 'tangle.generation'?: number;
130
+ /** Pivot to the specific cell. */
131
+ 'tangle.cellId'?: string;
132
+ /** Pivot to the specific scenario. */
133
+ 'tangle.scenarioId'?: string;
134
+ }
135
+ interface IngestEvalRunsRequest {
136
+ wireVersion: HostedWireVersion;
137
+ events: EvalRunEvent[];
138
+ }
139
+ interface IngestTracesRequest {
140
+ wireVersion: HostedWireVersion;
141
+ spans: TraceSpanEvent[];
142
+ }
143
+ interface IngestResponse {
144
+ /** Accepted events / spans count. */
145
+ accepted: number;
146
+ /** Rejected events with reasons (validation failures, dup idempotency key, etc.). */
147
+ rejected: Array<{
148
+ index: number;
149
+ reason: string;
150
+ }>;
151
+ }
152
+
153
+ /**
154
+ * # Hosted-tier ingest client.
155
+ *
156
+ * Ships eval-run events + trace spans to any orchestrator (ours, a
157
+ * partner's self-hosted one, or a future open implementation) that
158
+ * speaks the wire format in `./types.ts`.
159
+ *
160
+ * Three modes:
161
+ * - **Ours:** point at `https://orchestrator.tangle.tools/v1`. We
162
+ * handle ingest + storage + dashboard.
163
+ * - **Self-hosted:** point at whatever URL runs the reference receiver
164
+ * from `examples/hosted-ingest-server/`.
165
+ * - **Off (default):** when `hostedTenant` is unset, nothing is sent.
166
+ * Everything stays local.
167
+ */
168
+
169
+ interface HostedTenant {
170
+ /** Orchestrator endpoint base URL (no trailing slash). Required. */
171
+ endpoint: string;
172
+ /** Bearer token issued by the orchestrator. Required. */
173
+ apiKey: string;
174
+ /** Tenant id — the orchestrator's primary key for this consumer. Required. */
175
+ tenantId: string;
176
+ /** Optional `fetch` override (auth wrappers, custom agent, test mocks). */
177
+ fetchImpl?: typeof fetch;
178
+ /** Per-call timeout in ms. Default 30s. */
179
+ timeoutMs?: number;
180
+ /** Retries on 5xx / network errors. Default 2. */
181
+ retries?: number;
182
+ }
183
+ interface HostedClient {
184
+ ingestEvalRun(event: EvalRunEvent, idempotencyKey?: string): Promise<IngestResponse>;
185
+ ingestEvalRuns(events: EvalRunEvent[], idempotencyKey?: string): Promise<IngestResponse>;
186
+ ingestTraces(spans: TraceSpanEvent[], idempotencyKey?: string): Promise<IngestResponse>;
187
+ readonly tenant: HostedTenant;
188
+ readonly wireVersion: HostedWireVersion;
189
+ }
190
+ declare function createHostedClient(tenant: HostedTenant): HostedClient;
191
+
192
+ export { type EvalRunCellScore, type EvalRunEvent, type EvalRunGenerationSnapshot, type EvalRunStatus, HOSTED_WIRE_VERSION, type HostedClient, type HostedIngestHeaders, type HostedTenant, type HostedWireVersion, type IngestEvalRunsRequest, type IngestResponse, type IngestTracesRequest, type TraceSpanEvent, createHostedClient };
@@ -0,0 +1,10 @@
1
+ import {
2
+ HOSTED_WIRE_VERSION,
3
+ createHostedClient
4
+ } from "../chunk-ZQABFCVJ.js";
5
+ import "../chunk-NSBPE2FW.js";
6
+ export {
7
+ HOSTED_WIRE_VERSION,
8
+ createHostedClient
9
+ };
10
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.44.1",
5
+ "version": "0.46.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
package/dist/rl.d.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
2
- import { d as CampaignResult } from './types-BURGZ8Ug.js';
2
+ import { d as CampaignResult } from './types-8u72Gc76.js';
3
3
  import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-CoJMs2Iz.js';
4
4
  export { r as runEvalCampaign } from './researcher-CoJMs2Iz.js';
5
5
  import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
@@ -1,4 +1,4 @@
1
- import { S as Scenario, d as CampaignResult, j as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, J as JudgeConfig, L as LabeledScenarioStore, e as CampaignTraceWriter, M as MutableSurface, l as GenerationRecord } from './types-BURGZ8Ug.js';
1
+ import { S as Scenario, d as CampaignResult, j as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, g as DispatchFn, J as JudgeConfig, L as LabeledScenarioStore, e as CampaignTraceWriter, M as MutableSurface, l as GenerationRecord } from './types-8u72Gc76.js';
2
2
  import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
3
3
  import { RunRecord } from '@tangle-network/agent-runtime';
4
4
  import { R as RedTeamCase } from './red-team-30II1T4o.js';
@@ -414,4 +414,4 @@ interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extend
414
414
  }
415
415
  declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>;
416
416
 
417
- export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverOptions as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type RunEvalOptions as a, type RunImprovementLoopOptions as b, type RunImprovementLoopResult as c, composeGate as d, defaultProductionGate as e, evolutionaryDriver as f, fsCampaignStorage as g, gepaDriver as h, heldOutGate as i, inMemoryCampaignStorage as j, runEval as k, runImprovementLoop as l, type OpenAutoPrResult as m, type RunOptimizationOptions as n, type RunOptimizationResult as o, openAutoPr as p, runOptimization as q, runCampaign as r, surfaceHash as s };
417
+ export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverOptions as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunImprovementLoopResult as R, type RunCampaignOptions as a, type RunEvalOptions as b, type RunImprovementLoopOptions as c, composeGate as d, defaultProductionGate as e, evolutionaryDriver as f, fsCampaignStorage as g, gepaDriver as h, heldOutGate as i, inMemoryCampaignStorage as j, runEval as k, runImprovementLoop as l, type OpenAutoPrResult as m, type RunOptimizationOptions as n, type RunOptimizationResult as o, openAutoPr as p, runOptimization as q, runCampaign as r, surfaceHash as s };
@@ -372,4 +372,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
372
372
  scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
373
373
  }
374
374
 
375
- export type { CampaignAggregates as C, DispatchFn as D, Gate as G, ImprovementDriver as I, JudgeConfig as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, CampaignArtifactWriter as a, CampaignCellResult as b, CampaignCostMeter as c, CampaignResult as d, CampaignTraceWriter as e, CodeSurface as f, DispatchContext as g, GateContext as h, GateDecision as i, GateResult as j, GenerationCandidate as k, GenerationRecord as l, JudgeDimension as m, JudgeScore as n, Mutator as o, SessionScript as p, LabeledScenarioWrite as q, LabeledScenarioSampleArgs as r, LabeledScenarioRecord as s, JudgeAggregate as t, LabeledScenarioSource as u, ScenarioAggregate as v };
375
+ export type { CampaignAggregates as C, DispatchContext as D, Gate as G, ImprovementDriver as I, JudgeConfig as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, CampaignArtifactWriter as a, CampaignCellResult as b, CampaignCostMeter as c, CampaignResult as d, CampaignTraceWriter as e, CodeSurface as f, DispatchFn as g, GateContext as h, GateDecision as i, GateResult as j, GenerationCandidate as k, GenerationRecord as l, JudgeDimension as m, JudgeScore as n, Mutator as o, SessionScript as p, LabeledScenarioWrite as q, LabeledScenarioSampleArgs as r, LabeledScenarioRecord as s, JudgeAggregate as t, LabeledScenarioSource as u, ScenarioAggregate as v };
@@ -32,11 +32,11 @@ So adoption is *graduated*, and the builder picks the depth: (1) **trace-analysi
32
32
 
33
33
  | Tier | What they do | What they get | Billing |
34
34
  |---|---|---|---|
35
- | **LAND** (exists today) | `npm i @tangle-network/agent-eval`, wrap their agent behind one `dispatch` seam, bring a judge | Full self-improvement loop + **local** trace/eval artifacts. Any infra, no sandbox. | Free (lib) |
35
+ | **LAND** (exists today) | `npm i @tangle-network/agent-eval`, wrap their agent behind one `dispatch` seam, bring a judge | Full self-improvement loop + **local** trace/eval artifacts. Any infra, no sandbox. | Free (lib) — **with optional Tangle Router as a $0-friction inference upsell.** When a builder points `OPENAI_BASE_URL` at `router.tangle.tools/v1`, every campaign call (agent + judge + reflective mutation) routes through us; we earn the routing margin. Same code, opt-in monetization vector that ships today. |
36
36
  | **EXPAND** (the build) | Route trace/eval/labeled-scenario data to our orchestrator | Hosted dashboards, cross-run intelligence, the capture flywheel as a service | **Metered** — composes with existing sandbox Stripe + cost-ledger |
37
37
  | **PLATFORM** (the carrot) | Move execution into our sandbox (agent-dev-container) | Substrate + orchestrator data/intelligence pre-wired, batteries included | Sandbox usage |
38
38
 
39
- The free lib casts the widest possible net at near-zero cost (it's already published). Value capture is EXPAND: hosting their data/intelligence = a billable surface on the dimensions we already meter (ingested/retained volume, eval-campaign compute, loop runs, seats). "We don't host observability unless they route to us" is the *business model*, not a gap.
39
+ The free lib casts the widest possible net at near-zero cost (it's already published). LAND is **not actually zero-revenue** — pointing the loop at Tangle Router is a one-line config change with no other code differences, so we monetize inference for any LAND-tier adopter who opts in. The wedge ladder is therefore four steps: no-revenue install → router routing margin (LAND with router) → metered data hosting (EXPAND) → sandbox usage (PLATFORM). Each step is a one-line config change, never a rewrite. Value capture concentrates at EXPAND (hosting their data/intelligence is the biggest billable surface), but LAND-with-router is the immediate upsell available from day one.
40
40
 
41
41
  ## Plan & gates — land-first, validate, then build
42
42