@tangle-network/agent-eval 0.46.0 → 0.47.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,85 @@
1
+ // src/hosted/types.ts
2
+ var HOSTED_WIRE_VERSION = "2026-05-26.v1";
3
+
4
+ // src/hosted/client.ts
5
+ function sleep(ms) {
6
+ return new Promise((resolve) => {
7
+ const t = setTimeout(resolve, ms);
8
+ if (typeof t.unref === "function") t.unref();
9
+ });
10
+ }
11
+ async function post(tenant, path, body, opts = {}) {
12
+ const timeoutMs = tenant.timeoutMs ?? 3e4;
13
+ const maxRetries = tenant.retries ?? 2;
14
+ const f = tenant.fetchImpl ?? ((...args) => fetch(...args));
15
+ const url = `${tenant.endpoint.replace(/\/$/, "")}${path}`;
16
+ let lastError;
17
+ for (let attempt = 0; attempt <= maxRetries; attempt++) {
18
+ const ourTimeout = AbortSignal.timeout(timeoutMs);
19
+ const combinedSignal = opts.signal ? AbortSignal.any([opts.signal, ourTimeout]) : ourTimeout;
20
+ try {
21
+ const headers = {
22
+ "content-type": "application/json",
23
+ authorization: `Bearer ${tenant.apiKey}`,
24
+ "x-tangle-tenant-id": tenant.tenantId,
25
+ "x-tangle-wire-version": HOSTED_WIRE_VERSION
26
+ };
27
+ if (opts.idempotencyKey) headers["idempotency-key"] = opts.idempotencyKey;
28
+ const res = await f(url, {
29
+ method: "POST",
30
+ headers,
31
+ body: JSON.stringify(body),
32
+ signal: combinedSignal
33
+ });
34
+ if (!res.ok) {
35
+ const retryable = res.status >= 500 || res.status === 408 || res.status === 429;
36
+ if (!retryable || attempt === maxRetries) {
37
+ const text = await res.text().catch(() => "");
38
+ throw new Error(`hosted ingest ${url} failed (${res.status}): ${text.slice(0, 500)}`);
39
+ }
40
+ await sleep(2 ** attempt * 200 + Math.random() * 200);
41
+ continue;
42
+ }
43
+ return await res.json();
44
+ } catch (err) {
45
+ if (opts.signal?.aborted) throw err;
46
+ lastError = err;
47
+ if (attempt === maxRetries) throw err;
48
+ await sleep(2 ** attempt * 200 + Math.random() * 200);
49
+ }
50
+ }
51
+ throw lastError ?? new Error("hosted ingest exhausted retries");
52
+ }
53
+ function createHostedClient(tenant) {
54
+ return {
55
+ tenant,
56
+ wireVersion: HOSTED_WIRE_VERSION,
57
+ async ingestEvalRun(event, idempotencyKey) {
58
+ return this.ingestEvalRuns([event], idempotencyKey);
59
+ },
60
+ async ingestEvalRuns(events, idempotencyKey) {
61
+ const body = { wireVersion: HOSTED_WIRE_VERSION, events };
62
+ return post(
63
+ tenant,
64
+ "/v1/ingest/eval-runs",
65
+ body,
66
+ { idempotencyKey }
67
+ );
68
+ },
69
+ async ingestTraces(spans, idempotencyKey) {
70
+ const body = { wireVersion: HOSTED_WIRE_VERSION, spans };
71
+ return post(
72
+ tenant,
73
+ "/v1/ingest/traces",
74
+ body,
75
+ { idempotencyKey }
76
+ );
77
+ }
78
+ };
79
+ }
80
+
81
+ export {
82
+ HOSTED_WIRE_VERSION,
83
+ createHostedClient
84
+ };
85
+ //# sourceMappingURL=chunk-ZQABFCVJ.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/hosted/types.ts","../src/hosted/client.ts"],"sourcesContent":["/**\n * # Hosted-tier wire format — the schema that EVERY orchestrator (ours,\n * a partner's self-hosted one, a future open implementation) must accept.\n *\n * **Stability:** every type in this file is committed under semver. New\n * minors only ADD optional fields. Breaking changes mean a major bump\n * (`HostedWireVersion` literal increment).\n *\n * The wire format is two event streams in one transport:\n *\n * 1. **Eval-run events** (`POST /v1/ingest/eval-runs`). Posted when a\n * campaign / improvement-loop completes (or per-generation if\n * streaming). Carries the structured result + per-cell scores +\n * surface diffs the orchestrator stores for the dashboard.\n *\n * 2. **Trace spans** (`POST /v1/ingest/traces`). Standard OTLP-shaped\n * spans with a few additional attributes so the orchestrator can\n * pivot from eval-run → underlying execution. Compatible with any\n * OTel collector.\n *\n * Both endpoints are authenticated with a bearer token + a tenant id\n * header. Tenants isolate everything downstream of ingest; no tenant\n * ever sees another tenant's data.\n */\n\nimport type { GateDecision, MutableSurface } from '../campaign/types'\n\nexport const HOSTED_WIRE_VERSION = '2026-05-26.v1' as const\nexport type HostedWireVersion = typeof HOSTED_WIRE_VERSION\n\n// ── Transport headers ───────────────────────────────────────────────\n\n/** Every ingest request carries these. */\nexport interface HostedIngestHeaders {\n /** Bearer token. The orchestrator validates against the tenant key. */\n authorization: `Bearer ${string}`\n /** Stable tenant id (the orchestrator-side primary key for the tenant). */\n 'x-tangle-tenant-id': string\n /** Wire-version pin so the server can reject incompatible payloads. */\n 'x-tangle-wire-version': HostedWireVersion\n /** Optional idempotency key for retry-safe ingest. 
*/\n 'idempotency-key'?: string\n}\n\n// ── Eval-run event ──────────────────────────────────────────────────\n\n/** Lifecycle stages of an eval-run as the substrate reports them. */\nexport type EvalRunStatus = 'started' | 'baseline-complete' | 'generation-complete' | 'gate-decided' | 'finished' | 'errored'\n\nexport interface EvalRunCellScore {\n /** Stable scenario id from the consumer's scenario set. */\n scenarioId: string\n /** Repetition index when reps > 1; 0 for the default. */\n rep: number\n /** Composite score across all judges + dimensions for this cell. */\n compositeMean: number\n /** Per-judge → per-dimension scores; null where the judge did not run. */\n dimensions: Record<string, Record<string, number>>\n /** Per-cell error message if the dispatch threw. Null on success. */\n errorMessage?: string\n}\n\nexport interface EvalRunGenerationSnapshot {\n /** Generation index. 0 is baseline. */\n index: number\n /** Candidate surface fingerprint (stable hash) — pivot key into the\n * trace stream to fetch the underlying execution. */\n surfaceHash: string\n /** The candidate surface itself. May be omitted to avoid PII when the\n * consumer prefers not to ship verbatim prompts. */\n surface?: MutableSurface\n /** Per-cell scores for this generation. */\n cells: EvalRunCellScore[]\n /** Aggregate composite mean across all cells in this generation. */\n compositeMean: number\n /** Total $ spent across this generation. */\n costUsd: number\n /** Wall-clock duration of this generation. */\n durationMs: number\n}\n\n/**\n * The top-level eval-run event. One ingest call per logical eval-run;\n * generations stream in incrementally via repeated calls with the same\n * `runId`. The orchestrator deduplicates by `(runId, generation.index)`.\n */\nexport interface EvalRunEvent {\n /** Stable run id (the substrate's `runId`). UUID or substrate-generated. */\n runId: string\n /** Where this run was happening — derived from `RunCampaignOptions.runDir`. 
*/\n runDir: string\n /** ISO-8601 timestamp the substrate recorded the event. */\n timestamp: string\n /** Lifecycle stage this event represents. */\n status: EvalRunStatus\n /** Free-form consumer tags (env, branch, model id, etc.). Searchable. */\n labels: Record<string, string>\n /** Baseline campaign snapshot. Present when status >= baseline-complete. */\n baseline?: EvalRunGenerationSnapshot\n /** Per-generation snapshots. Streams in; orchestrator appends. */\n generations: EvalRunGenerationSnapshot[]\n /** Final gate decision. Present when status >= gate-decided. */\n gateDecision?: GateDecision\n /** Held-out lift = winner-on-holdout - baseline-on-holdout. */\n holdoutLift?: number\n /** Total $ spent across baseline + every generation. */\n totalCostUsd: number\n /** Total wall-clock duration. */\n totalDurationMs: number\n /** Error message if status === 'errored'. */\n errorMessage?: string\n}\n\n// ── Trace span event ────────────────────────────────────────────────\n\n/**\n * OTel-shape span with a few additional attributes for eval-run pivoting.\n * Compatible with any OTLP collector — `name`, `traceId`, `spanId`,\n * `startTimeUnixNano`, `endTimeUnixNano`, `attributes` are stock OTel.\n */\nexport interface TraceSpanEvent {\n traceId: string\n spanId: string\n parentSpanId?: string\n name: string\n startTimeUnixNano: number\n endTimeUnixNano: number\n attributes: Record<string, string | number | boolean>\n events?: Array<{ timeUnixNano: number; name: string; attributes?: Record<string, string | number | boolean> }>\n status?: { code: 'OK' | 'ERROR' | 'UNSET'; message?: string }\n /** Pivot back into the eval-run stream. */\n 'tangle.runId'?: string\n /** Pivot to the specific generation. */\n 'tangle.generation'?: number\n /** Pivot to the specific cell. */\n 'tangle.cellId'?: string\n /** Pivot to the specific scenario. 
*/\n 'tangle.scenarioId'?: string\n}\n\n// ── Ingest request bodies ───────────────────────────────────────────\n\nexport interface IngestEvalRunsRequest {\n wireVersion: HostedWireVersion\n events: EvalRunEvent[]\n}\n\nexport interface IngestTracesRequest {\n wireVersion: HostedWireVersion\n spans: TraceSpanEvent[]\n}\n\nexport interface IngestResponse {\n /** Accepted events / spans count. */\n accepted: number\n /** Rejected events with reasons (validation failures, dup idempotency key, etc.). */\n rejected: Array<{ index: number; reason: string }>\n}\n","/**\n * # Hosted-tier ingest client.\n *\n * Ships eval-run events + trace spans to any orchestrator (ours, a\n * partner's self-hosted one, or a future open implementation) that\n * speaks the wire format in `./types.ts`.\n *\n * Three modes:\n * - **Ours:** point at `https://orchestrator.tangle.tools/v1`. We\n * handle ingest + storage + dashboard.\n * - **Self-hosted:** point at whatever URL runs the reference receiver\n * from `examples/hosted-ingest-server/`.\n * - **Off (default):** when `hostedTenant` is unset, nothing is sent.\n * Everything stays local.\n */\n\nimport {\n HOSTED_WIRE_VERSION,\n type EvalRunEvent,\n type HostedWireVersion,\n type IngestEvalRunsRequest,\n type IngestResponse,\n type IngestTracesRequest,\n type TraceSpanEvent,\n} from './types'\n\nexport interface HostedTenant {\n /** Orchestrator endpoint base URL (no trailing slash). Required. */\n endpoint: string\n /** Bearer token issued by the orchestrator. Required. */\n apiKey: string\n /** Tenant id — the orchestrator's primary key for this consumer. Required. */\n tenantId: string\n /** Optional `fetch` override (auth wrappers, custom agent, test mocks). */\n fetchImpl?: typeof fetch\n /** Per-call timeout in ms. Default 30s. */\n timeoutMs?: number\n /** Retries on 5xx / network errors. Default 2. 
*/\n retries?: number\n}\n\nexport interface HostedClient {\n ingestEvalRun(event: EvalRunEvent, idempotencyKey?: string): Promise<IngestResponse>\n ingestEvalRuns(events: EvalRunEvent[], idempotencyKey?: string): Promise<IngestResponse>\n ingestTraces(spans: TraceSpanEvent[], idempotencyKey?: string): Promise<IngestResponse>\n readonly tenant: HostedTenant\n readonly wireVersion: HostedWireVersion\n}\n\ninterface RequestOptions {\n idempotencyKey?: string\n signal?: AbortSignal\n}\n\nfunction sleep(ms: number): Promise<void> {\n return new Promise((resolve) => {\n const t = setTimeout(resolve, ms)\n if (typeof (t as { unref?: () => void }).unref === 'function') (t as { unref: () => void }).unref()\n })\n}\n\nasync function post<TReq, TRes>(\n tenant: HostedTenant,\n path: string,\n body: TReq,\n opts: RequestOptions = {},\n): Promise<TRes> {\n const timeoutMs = tenant.timeoutMs ?? 30_000\n const maxRetries = tenant.retries ?? 2\n const f: typeof fetch = tenant.fetchImpl ?? ((...args) => fetch(...args))\n const url = `${tenant.endpoint.replace(/\\/$/, '')}${path}`\n\n let lastError: unknown\n for (let attempt = 0; attempt <= maxRetries; attempt++) {\n const ourTimeout = AbortSignal.timeout(timeoutMs)\n const combinedSignal = opts.signal ? 
AbortSignal.any([opts.signal, ourTimeout]) : ourTimeout\n try {\n const headers: Record<string, string> = {\n 'content-type': 'application/json',\n authorization: `Bearer ${tenant.apiKey}`,\n 'x-tangle-tenant-id': tenant.tenantId,\n 'x-tangle-wire-version': HOSTED_WIRE_VERSION,\n }\n if (opts.idempotencyKey) headers['idempotency-key'] = opts.idempotencyKey\n\n const res = await f(url, {\n method: 'POST',\n headers,\n body: JSON.stringify(body),\n signal: combinedSignal,\n })\n if (!res.ok) {\n const retryable = res.status >= 500 || res.status === 408 || res.status === 429\n if (!retryable || attempt === maxRetries) {\n const text = await res.text().catch(() => '')\n throw new Error(`hosted ingest ${url} failed (${res.status}): ${text.slice(0, 500)}`)\n }\n await sleep(2 ** attempt * 200 + Math.random() * 200)\n continue\n }\n return (await res.json()) as TRes\n } catch (err) {\n if (opts.signal?.aborted) throw err\n lastError = err\n if (attempt === maxRetries) throw err\n await sleep(2 ** attempt * 200 + Math.random() * 200)\n }\n }\n throw lastError ?? 
new Error('hosted ingest exhausted retries')\n}\n\nexport function createHostedClient(tenant: HostedTenant): HostedClient {\n return {\n tenant,\n wireVersion: HOSTED_WIRE_VERSION,\n\n async ingestEvalRun(event, idempotencyKey) {\n return this.ingestEvalRuns([event], idempotencyKey)\n },\n\n async ingestEvalRuns(events, idempotencyKey) {\n const body: IngestEvalRunsRequest = { wireVersion: HOSTED_WIRE_VERSION, events }\n return post<IngestEvalRunsRequest, IngestResponse>(\n tenant,\n '/v1/ingest/eval-runs',\n body,\n { idempotencyKey },\n )\n },\n\n async ingestTraces(spans, idempotencyKey) {\n const body: IngestTracesRequest = { wireVersion: HOSTED_WIRE_VERSION, spans }\n return post<IngestTracesRequest, IngestResponse>(\n tenant,\n '/v1/ingest/traces',\n body,\n { idempotencyKey },\n )\n },\n }\n}\n"],"mappings":";AA2BO,IAAM,sBAAsB;;;AC2BnC,SAAS,MAAM,IAA2B;AACxC,SAAO,IAAI,QAAQ,CAAC,YAAY;AAC9B,UAAM,IAAI,WAAW,SAAS,EAAE;AAChC,QAAI,OAAQ,EAA6B,UAAU,WAAY,CAAC,EAA4B,MAAM;AAAA,EACpG,CAAC;AACH;AAEA,eAAe,KACb,QACA,MACA,MACA,OAAuB,CAAC,GACT;AACf,QAAM,YAAY,OAAO,aAAa;AACtC,QAAM,aAAa,OAAO,WAAW;AACrC,QAAM,IAAkB,OAAO,cAAc,IAAI,SAAS,MAAM,GAAG,IAAI;AACvE,QAAM,MAAM,GAAG,OAAO,SAAS,QAAQ,OAAO,EAAE,CAAC,GAAG,IAAI;AAExD,MAAI;AACJ,WAAS,UAAU,GAAG,WAAW,YAAY,WAAW;AACtD,UAAM,aAAa,YAAY,QAAQ,SAAS;AAChD,UAAM,iBAAiB,KAAK,SAAS,YAAY,IAAI,CAAC,KAAK,QAAQ,UAAU,CAAC,IAAI;AAClF,QAAI;AACF,YAAM,UAAkC;AAAA,QACtC,gBAAgB;AAAA,QAChB,eAAe,UAAU,OAAO,MAAM;AAAA,QACtC,sBAAsB,OAAO;AAAA,QAC7B,yBAAyB;AAAA,MAC3B;AACA,UAAI,KAAK,eAAgB,SAAQ,iBAAiB,IAAI,KAAK;AAE3D,YAAM,MAAM,MAAM,EAAE,KAAK;AAAA,QACvB,QAAQ;AAAA,QACR;AAAA,QACA,MAAM,KAAK,UAAU,IAAI;AAAA,QACzB,QAAQ;AAAA,MACV,CAAC;AACD,UAAI,CAAC,IAAI,IAAI;AACX,cAAM,YAAY,IAAI,UAAU,OAAO,IAAI,WAAW,OAAO,IAAI,WAAW;AAC5E,YAAI,CAAC,aAAa,YAAY,YAAY;AACxC,gBAAM,OAAO,MAAM,IAAI,KAAK,EAAE,MAAM,MAAM,EAAE;AAC5C,gBAAM,IAAI,MAAM,iBAAiB,GAAG,YAAY,IAAI,MAAM,MAAM,KAAK,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,QACtF;AACA,cAAM,MAAM,KAAK,UAAU,MAAM,KAAK,OAAO,IAAI,GAAG;AACpD;AAAA,MACF;AACA,aAAQ,MAAM,IAAI,KAAK;AAAA,
IACzB,SAAS,KAAK;AACZ,UAAI,KAAK,QAAQ,QAAS,OAAM;AAChC,kBAAY;AACZ,UAAI,YAAY,WAAY,OAAM;AAClC,YAAM,MAAM,KAAK,UAAU,MAAM,KAAK,OAAO,IAAI,GAAG;AAAA,IACtD;AAAA,EACF;AACA,QAAM,aAAa,IAAI,MAAM,iCAAiC;AAChE;AAEO,SAAS,mBAAmB,QAAoC;AACrE,SAAO;AAAA,IACL;AAAA,IACA,aAAa;AAAA,IAEb,MAAM,cAAc,OAAO,gBAAgB;AACzC,aAAO,KAAK,eAAe,CAAC,KAAK,GAAG,cAAc;AAAA,IACpD;AAAA,IAEA,MAAM,eAAe,QAAQ,gBAAgB;AAC3C,YAAM,OAA8B,EAAE,aAAa,qBAAqB,OAAO;AAC/E,aAAO;AAAA,QACL;AAAA,QACA;AAAA,QACA;AAAA,QACA,EAAE,eAAe;AAAA,MACnB;AAAA,IACF;AAAA,IAEA,MAAM,aAAa,OAAO,gBAAgB;AACxC,YAAM,OAA4B,EAAE,aAAa,qBAAqB,MAAM;AAC5E,aAAO;AAAA,QACL;AAAA,QACA;AAAA,QACA;AAAA,QACA,EAAE,eAAe;AAAA,MACnB;AAAA,IACF;AAAA,EACF;AACF;","names":[]}
@@ -3,6 +3,7 @@ export { C as CampaignAggregates, a as CampaignArtifactWriter, b as CampaignCell
3
3
  import { C as CampaignStorage, R as RunImprovementLoopResult } from '../run-improvement-loop-Bfam3MT1.js';
4
4
  export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverOptions, H as HeldOutGateOptions, a as RunCampaignOptions, b as RunEvalOptions, c as RunImprovementLoopOptions, d as composeGate, e as defaultProductionGate, f as evolutionaryDriver, g as fsCampaignStorage, h as gepaDriver, i as heldOutGate, j as inMemoryCampaignStorage, r as runCampaign, k as runEval, l as runImprovementLoop } from '../run-improvement-loop-Bfam3MT1.js';
5
5
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, a as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, O as OutcomeStore } from '../outcome-store-BxJ3DQKJ.js';
6
+ import { HostedTenant } from '../hosted/index.js';
6
7
  import '../llm-client-BXVRUZyX.js';
7
8
  import '../errors-mje_cKOs.js';
8
9
  import '../raw-provider-sink-C46HDghv.js';
@@ -145,6 +146,23 @@ interface SelfImproveOptions<TScenario extends Scenario, TArtifact> {
145
146
  autoOnPromote?: 'pr' | 'none';
146
147
  ghOwner?: string;
147
148
  ghRepo?: string;
149
+ /**
150
+ * Opt-in: ship eval-run events to a hosted orchestrator (ours, your
151
+ * self-hosted one, or any compatible implementation of the
152
+ * `docs/hosted-ingest-spec.md` wire format). When set, the substrate
153
+ * POSTs the final `EvalRunEvent` to `${endpoint}/v1/ingest/eval-runs`
154
+ * after the loop completes. Failures are logged but do not fail the
155
+ * loop — local result is always returned.
156
+ *
157
+ * For our orchestrator: `{ endpoint: 'https://orchestrator.tangle.tools/v1', apiKey, tenantId }`.
158
+ *
159
+ * For your self-hosted: any URL serving the wire format. See
160
+ * `examples/hosted-ingest-server/` for the reference receiver.
161
+ */
162
+ hostedTenant?: HostedTenant;
163
+ /** Free-form labels attached to the hosted event (env, branch, model id,
164
+ * etc.). Ignored when `hostedTenant` is unset. */
165
+ hostedLabels?: Record<string, string>;
148
166
  }
149
167
  interface SelfImproveResult<TScenario extends Scenario, TArtifact> {
150
168
  /** Composite mean across all scenarios, baseline run. */
@@ -7,6 +7,9 @@ import {
7
7
  runEval,
8
8
  runImprovementLoop
9
9
  } from "../chunk-HRKOCLQA.js";
10
+ import {
11
+ createHostedClient
12
+ } from "../chunk-ZQABFCVJ.js";
10
13
  import {
11
14
  fsCampaignStorage,
12
15
  inMemoryCampaignStorage,
@@ -74,7 +77,9 @@ async function selfImprove(opts) {
74
77
  holdout: explicitHoldout
75
78
  } : splitTrainHoldout(opts.scenarios, holdoutFraction);
76
79
  if (train.length === 0) {
77
- throw new Error("selfImprove: train split is empty. Reduce holdoutFraction or pass more scenarios.");
80
+ throw new Error(
81
+ "selfImprove: train split is empty. Reduce holdoutFraction or pass more scenarios."
82
+ );
78
83
  }
79
84
  if (holdout.length === 0) {
80
85
  throw new Error("selfImprove: holdout split is empty. Pass more scenarios.");
@@ -134,7 +139,7 @@ async function selfImprove(opts) {
134
139
  (sum, gen) => sum + gen.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0),
135
140
  0
136
141
  );
137
- return {
142
+ const summary = {
138
143
  baseline,
139
144
  winner: {
140
145
  ...winnerStats,
@@ -147,6 +152,81 @@ async function selfImprove(opts) {
147
152
  totalCostUsd: totalCost,
148
153
  raw: result
149
154
  };
155
+ if (opts.hostedTenant) {
156
+ try {
157
+ await shipEvalRunToHosted(opts.hostedTenant, opts, summary, result, runDir);
158
+ } catch (err) {
159
+ const msg = err instanceof Error ? err.message : String(err);
160
+ console.warn(`[agent-eval] hosted ingest failed (continuing): ${msg}`);
161
+ }
162
+ }
163
+ return summary;
164
+ }
165
+ async function shipEvalRunToHosted(tenant, opts, summary, raw, runDir) {
166
+ const client = createHostedClient(tenant);
167
+ function snapshotFromCampaign(index, surface, campaign, durationMs) {
168
+ const cells = campaign.cells.map((cell) => {
169
+ const judgeScores = Object.values(cell.judgeScores);
170
+ const composite = judgeScores.length === 0 ? 0 : judgeScores.reduce((s, j) => s + j.composite, 0) / judgeScores.length;
171
+ return {
172
+ scenarioId: cell.scenarioId,
173
+ rep: cell.rep,
174
+ compositeMean: composite,
175
+ dimensions: Object.fromEntries(
176
+ Object.entries(cell.judgeScores).map(([name, score]) => [name, score.dimensions])
177
+ ),
178
+ errorMessage: cell.error ?? void 0
179
+ };
180
+ });
181
+ const compositeMean = cells.length === 0 ? 0 : cells.reduce((s, c) => s + c.compositeMean, 0) / cells.length;
182
+ return {
183
+ index,
184
+ surfaceHash: typeof surface === "string" ? hashString(surface) : hashString(JSON.stringify(surface ?? "")),
185
+ surface,
186
+ cells,
187
+ compositeMean,
188
+ costUsd: campaign.aggregates.totalCostUsd,
189
+ durationMs
190
+ };
191
+ }
192
+ const generations = [];
193
+ generations.push(snapshotFromCampaign(0, opts.baselineSurface, raw.baselineCampaign, 0));
194
+ for (const gen of raw.generations) {
195
+ const winner = gen.surfaces.reduce(
196
+ (best, s) => s.campaign.aggregates.cellsExecuted > 0 && (best === void 0 || averageComposite(s.campaign) > averageComposite(best.campaign)) ? s : best,
197
+ gen.surfaces[0]
198
+ );
199
+ if (!winner) continue;
200
+ generations.push(
201
+ snapshotFromCampaign(gen.record.generationIndex + 1, winner.surface, winner.campaign, 0)
202
+ );
203
+ }
204
+ const event = {
205
+ runId: `${runDir}#${Date.now()}`,
206
+ runDir,
207
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
208
+ status: "finished",
209
+ labels: opts.hostedLabels ?? {},
210
+ baseline: generations[0],
211
+ generations,
212
+ gateDecision: summary.gateDecision,
213
+ holdoutLift: summary.lift,
214
+ totalCostUsd: summary.totalCostUsd,
215
+ totalDurationMs: summary.durationMs
216
+ };
217
+ await client.ingestEvalRun(event);
218
+ }
219
/**
 * Unweighted mean of `meanComposite` across every entry of
 * `campaign.aggregates.byScenario`; 0 when there are no entries.
 */
function averageComposite(campaign) {
  const perScenario = Object.values(campaign.aggregates.byScenario);
  if (perScenario.length === 0) {
    return 0;
  }
  let total = 0;
  for (const entry of perScenario) {
    total += entry.meanComposite;
  }
  return total / perScenario.length;
}
223
/**
 * 32-bit FNV-1a style hash of `s`, computed over its UTF-16 code units and
 * returned as an 8-character, zero-padded, lowercase hex string.
 */
function hashString(s) {
  const FNV_OFFSET_BASIS = 0x811c9dc5;
  const FNV_PRIME = 0x01000193;
  let hash = FNV_OFFSET_BASIS;
  for (let pos = 0; pos < s.length; pos += 1) {
    // XOR in the code unit, multiply by the prime, and keep the result unsigned 32-bit.
    hash = Math.imul(hash ^ s.charCodeAt(pos), FNV_PRIME) >>> 0;
  }
  return hash.toString(16).padStart(8, "0");
}
151
231
  export {
152
232
  FileSystemOutcomeStore,
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/contract/self-improve.ts"],"sourcesContent":["/**\n * # `selfImprove()` — the LAND-tier one-shot.\n *\n * The cheapest possible call site to run a real closed-loop self-\n * improvement over your agent. Wraps `runImprovementLoop` with smart\n * defaults and a budget-shaped options API; every escape hatch the\n * substrate exposes is reachable from here without losing the\n * one-function feel.\n *\n * Defaults picked to match the LAND-tier story:\n * - In-memory storage (no filesystem touch).\n * - `gepaDriver` reflective mutation with copywriting-flavored primitives\n * (override `driver` or `mutationPrimitives` for any domain).\n * - `defaultProductionGate` with `deltaThreshold: 0.05`.\n * - Held-out split = 25% of scenarios, deterministic by id hash.\n * - 3 generations × population 2 (raise via `budget` for more search).\n * - `autoOnPromote: 'none'` (we don't open PRs unless you ask).\n *\n * Want one-click? Provide `agent` + `scenarios` + `judge`. Done.\n * Want distributed? Pass `cellPlacement` + an `httpDispatch`-backed\n * agent. Want a code-tier surface? Pass a `MutableSurface` + your own\n * `driver`. Same function.\n */\n\nimport { runImprovementLoop, type RunImprovementLoopResult } from '../campaign/presets/run-improvement-loop'\nimport { gepaDriver } from '../campaign/drivers/gepa'\nimport { defaultProductionGate } from '../campaign/gates/default-production-gate'\nimport { type CampaignStorage, inMemoryCampaignStorage } from '../campaign/storage'\nimport type {\n DispatchContext,\n Gate,\n ImprovementDriver,\n JudgeConfig,\n MutableSurface,\n Scenario,\n} from '../campaign/types'\n\nexport interface SelfImproveBudget {\n /** Hard $ ceiling across all cells in baseline + every generation. Cells\n * beyond the ceiling are skipped (cost-aware, not aborted). */\n dollars?: number\n /** How many improvement generations to explore. Default 3. 
Set 0 to\n * skip improvement entirely (selfImprove becomes a baseline-only run). */\n generations?: number\n /** Candidates the driver proposes per generation. Default 2. */\n populationSize?: number\n /** Max concurrent cells across the loop. Default 2. */\n maxConcurrency?: number\n /** Fraction of `scenarios` held out from training, used for the gate.\n * Default 0.25. Ignored when `holdoutScenarios` is set explicitly. */\n holdoutFraction?: number\n /** Explicit held-out scenarios; overrides `holdoutFraction`. */\n holdoutScenarios?: Scenario[]\n}\n\nexport interface SelfImproveLlm {\n /** Endpoint base URL. Default Tangle Router. */\n baseUrl?: string\n /** Bearer token. Default `process.env.OPENAI_API_KEY`. */\n apiKey?: string\n /** Model id used by `gepaDriver` reflection. Default\n * `anthropic/claude-sonnet-4.6`. */\n model?: string\n}\n\nexport type SelfImproveProgressEvent =\n | { kind: 'baseline.started'; scenarios: number }\n | { kind: 'baseline.completed'; compositeMean: number; durationMs: number }\n | { kind: 'generation.started'; index: number; populationSize: number }\n | { kind: 'generation.completed'; index: number; bestComposite: number; durationMs: number }\n | { kind: 'gate.decided'; decision: string; lift: number }\n\nexport interface SelfImproveOptions<TScenario extends Scenario, TArtifact> {\n /**\n * Your agent — a function that takes the current `MutableSurface`\n * (typically a system prompt the loop is optimizing) plus the\n * scenario + cell ctx, and returns the artifact your judge scores.\n *\n * Same shape as `RunOptimizationOptions.dispatchWithSurface`. 
Wrap a\n * plain `Dispatch` if you don't have a surface seam:\n *\n * agent: (_surface, scenario, ctx) => yourPlainDispatch(scenario, ctx)\n *\n * That mode evaluates without mutating any surface — useful as a\n * baseline-only run (set `budget.generations = 0`).\n */\n agent: (\n surface: MutableSurface,\n scenario: TScenario,\n ctx: DispatchContext,\n ) => Promise<TArtifact>\n\n /** Scenarios to evaluate against. Train/holdout split is computed from\n * these unless `budget.holdoutScenarios` is set explicitly. */\n scenarios: TScenario[]\n\n /** Judge that scores artifacts. Bring your own; use `langchainJudge`\n * from `/adapters/langchain` for a Runnable-shaped one. */\n judge: JudgeConfig<TArtifact, TScenario>\n\n /** Starting surface — system prompt, JSON config, anything `MutableSurface`\n * accepts. The driver mutates this each generation. */\n baselineSurface: MutableSurface\n\n /** Budget + loop shape. All fields optional; defaults pick the LAND-tier\n * story. */\n budget?: SelfImproveBudget\n\n /** Custom driver. Default is `gepaDriver` configured from `llm` +\n * `mutationPrimitives`. */\n driver?: ImprovementDriver\n\n /** Default-driver overrides — used when `driver` is unset. */\n mutationPrimitives?: string[]\n driverTarget?: string\n\n /** Custom gate. Default is `defaultProductionGate` with\n * `deltaThreshold: 0.05` on the held-out split. */\n gate?: Gate<TArtifact, TScenario>\n\n /** LLM config consumed by the default `gepaDriver`. Ignored if you pass\n * your own `driver`. */\n llm?: SelfImproveLlm\n\n /** Storage backend. Default `inMemoryCampaignStorage()` — nothing\n * persists past the call. Pass `fsCampaignStorage()` to write to disk. */\n storage?: CampaignStorage\n\n /** Run directory (logical for in-memory storage, real path for fs).\n * Default `mem://selfImprove-<timestamp>`. 
*/\n runDir?: string\n\n /** Distributed-driver seam — same as `RunCampaignOptions.cellPlacement`.\n * Returns an opaque placement key the substrate forwards to your agent\n * as `ctx.placement`. Combined with `httpDispatch` from\n * `/adapters/http`, fans cells across regions. */\n cellPlacement?: (input: {\n scenario: TScenario\n rep: number\n generation?: number\n }) => string | undefined\n\n /** Streaming hook — fires on baseline + each generation + gate decision.\n * Consumer routes events wherever (UI, dashboard, logs). */\n onProgress?: (event: SelfImproveProgressEvent) => void\n\n /** Auto-promotion behavior on a ship decision. Default `'none'` — we\n * return the winner; you ship it however you ship. `'pr'` opens a\n * GitHub PR via `openAutoPr`; requires `ghOwner` + `ghRepo`. */\n autoOnPromote?: 'pr' | 'none'\n ghOwner?: string\n ghRepo?: string\n}\n\nexport interface SelfImproveResult<TScenario extends Scenario, TArtifact> {\n /** Composite mean across all scenarios, baseline run. */\n baseline: {\n compositeMean: number\n perScenario: Record<string, number>\n }\n /** Composite mean on the held-out set, winner run. */\n winner: {\n compositeMean: number\n perScenario: Record<string, number>\n surface: MutableSurface\n }\n /** `winner.compositeMean - baselineOnHoldout.compositeMean`. Positive\n * means the gate observed improvement. */\n lift: number\n /** `defaultProductionGate.decide()` result. */\n gateDecision: 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling'\n /** Number of generations actually explored (may be less than the\n * budget if the driver gave up early). */\n generationsExplored: number\n /** Wall-clock total. */\n durationMs: number\n /** Total cost across baseline + every generation. */\n totalCostUsd: number\n /**\n * Raw substrate result for advanced inspection — full per-generation\n * candidates, full campaign artifacts, all judge scores. 
Useful for\n * debugging or reporting beyond the summary.\n */\n raw: RunImprovementLoopResult<TArtifact, TScenario>\n}\n\n/**\n * Deterministic train/holdout split by a stable hash of `scenario.id`,\n * so the same scenario set always splits the same way across runs.\n */\nfunction splitTrainHoldout<TScenario extends Scenario>(\n scenarios: TScenario[],\n fraction: number,\n): { train: TScenario[]; holdout: TScenario[] } {\n // Stable fnv-1a-ish hash of the id for ordering.\n function hash(s: string): number {\n let h = 2166136261 >>> 0\n for (let i = 0; i < s.length; i++) {\n h ^= s.charCodeAt(i)\n h = Math.imul(h, 16777619) >>> 0\n }\n return h\n }\n const sorted = [...scenarios].sort((a, b) => hash(a.id) - hash(b.id))\n const nHoldout = Math.max(1, Math.min(sorted.length - 1, Math.round(sorted.length * fraction)))\n return {\n holdout: sorted.slice(0, nHoldout),\n train: sorted.slice(nHoldout),\n }\n}\n\nfunction meanComposite(\n byScenario: Record<string, { meanComposite: number }>,\n): { compositeMean: number; perScenario: Record<string, number> } {\n const perScenario: Record<string, number> = {}\n const values: number[] = []\n for (const [id, agg] of Object.entries(byScenario)) {\n perScenario[id] = agg.meanComposite\n values.push(agg.meanComposite)\n }\n return {\n compositeMean: values.length === 0 ? 0 : values.reduce((s, v) => s + v, 0) / values.length,\n perScenario,\n }\n}\n\nconst DEFAULT_MUTATION_PRIMITIVES = [\n 'Tighten the hook: lead with the specific user outcome.',\n 'Replace generic adjectives with specific verbs or proof numbers.',\n 'Anchor every claim in something the scenario\\'s brief literally supports.',\n 'Honor the surface-shape constraint (length, register, audience vocabulary).',\n]\n\n/**\n * One-shot self-improvement loop. 
See module docstring for defaults +\n * extension points.\n *\n * @example Minimum (LAND tier):\n *\n * const result = await selfImprove({\n * agent: (surface, scenario, ctx) => myAgent(surface, scenario, ctx.signal),\n * scenarios,\n * judge,\n * baselineSurface: DEFAULT_PROMPT,\n * })\n * console.log(`lift: ${result.lift.toFixed(3)} (${result.gateDecision})`)\n *\n * @example Distributed (workers in three regions):\n *\n * await selfImprove({\n * agent: httpDispatch({ resolveUrl: ({ placement }) => REGION_URLS[placement!] }),\n * scenarios,\n * judge,\n * baselineSurface: DEFAULT_PROMPT,\n * cellPlacement: ({ scenario }) => scenario.region,\n * budget: { maxConcurrency: 12 },\n * })\n */\nexport async function selfImprove<TScenario extends Scenario, TArtifact>(\n opts: SelfImproveOptions<TScenario, TArtifact>,\n): Promise<SelfImproveResult<TScenario, TArtifact>> {\n const startedAt = Date.now()\n\n const budget = opts.budget ?? {}\n const generations = budget.generations ?? 3\n const populationSize = budget.populationSize ?? 2\n const maxConcurrency = budget.maxConcurrency ?? 2\n const holdoutFraction = budget.holdoutFraction ?? 0.25\n const costCeiling = budget.dollars\n\n const explicitHoldout = budget.holdoutScenarios\n const { train, holdout } = explicitHoldout\n ? {\n train: opts.scenarios.filter((s) => !explicitHoldout.some((h) => h.id === s.id)),\n holdout: explicitHoldout as TScenario[],\n }\n : splitTrainHoldout(opts.scenarios, holdoutFraction)\n\n if (train.length === 0) {\n throw new Error('selfImprove: train split is empty. Reduce holdoutFraction or pass more scenarios.')\n }\n if (holdout.length === 0) {\n throw new Error('selfImprove: holdout split is empty. Pass more scenarios.')\n }\n\n const driver: ImprovementDriver =\n opts.driver ??\n gepaDriver({\n llm: {\n baseUrl: opts.llm?.baseUrl ?? 'https://router.tangle.tools/v1',\n apiKey: opts.llm?.apiKey ?? process.env.OPENAI_API_KEY ?? '',\n },\n model: opts.llm?.model ?? 
'anthropic/claude-sonnet-4.6',\n target: opts.driverTarget ?? 'agent surface (system prompt or config) being optimized by selfImprove',\n mutationPrimitives: opts.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES,\n })\n\n const gate: Gate<TArtifact, TScenario> =\n opts.gate ??\n defaultProductionGate<TArtifact, TScenario>({\n holdoutScenarios: holdout,\n deltaThreshold: 0.05,\n })\n\n const storage = opts.storage ?? inMemoryCampaignStorage()\n const runDir = opts.runDir ?? `mem://selfImprove-${startedAt}`\n\n if (opts.onProgress) {\n opts.onProgress({ kind: 'baseline.started', scenarios: opts.scenarios.length })\n }\n\n const result = await runImprovementLoop<TScenario, TArtifact>({\n scenarios: train,\n baselineSurface: opts.baselineSurface,\n dispatchWithSurface: opts.agent,\n driver,\n judges: [opts.judge],\n populationSize,\n maxGenerations: generations,\n holdoutScenarios: holdout,\n gate,\n autoOnPromote: opts.autoOnPromote ?? 'none',\n ghOwner: opts.ghOwner,\n ghRepo: opts.ghRepo,\n storage,\n runDir,\n maxConcurrency,\n cellPlacement: opts.cellPlacement,\n costCeiling,\n })\n\n const baseline = meanComposite(result.baselineOnHoldout.aggregates.byScenario)\n const winnerStats = meanComposite(result.winnerOnHoldout.aggregates.byScenario)\n\n if (opts.onProgress) {\n opts.onProgress({\n kind: 'baseline.completed',\n compositeMean: baseline.compositeMean,\n durationMs: Date.now() - startedAt,\n })\n opts.onProgress({\n kind: 'gate.decided',\n decision: result.gateResult.decision,\n lift: winnerStats.compositeMean - baseline.compositeMean,\n })\n }\n\n const totalCost =\n result.baselineCampaign.aggregates.totalCostUsd +\n result.generations.reduce(\n (sum, gen) => sum + gen.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0),\n 0,\n )\n\n return {\n baseline,\n winner: {\n ...winnerStats,\n surface: result.winnerSurface,\n },\n lift: winnerStats.compositeMean - baseline.compositeMean,\n gateDecision: result.gateResult.decision,\n 
generationsExplored: result.generations.length,\n durationMs: Date.now() - startedAt,\n totalCostUsd: totalCost,\n raw: result,\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;AA8LA,SAAS,kBACP,WACA,UAC8C;AAE9C,WAAS,KAAK,GAAmB;AAC/B,QAAI,IAAI,eAAe;AACvB,aAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,WAAK,EAAE,WAAW,CAAC;AACnB,UAAI,KAAK,KAAK,GAAG,QAAQ,MAAM;AAAA,IACjC;AACA,WAAO;AAAA,EACT;AACA,QAAM,SAAS,CAAC,GAAG,SAAS,EAAE,KAAK,CAAC,GAAG,MAAM,KAAK,EAAE,EAAE,IAAI,KAAK,EAAE,EAAE,CAAC;AACpE,QAAM,WAAW,KAAK,IAAI,GAAG,KAAK,IAAI,OAAO,SAAS,GAAG,KAAK,MAAM,OAAO,SAAS,QAAQ,CAAC,CAAC;AAC9F,SAAO;AAAA,IACL,SAAS,OAAO,MAAM,GAAG,QAAQ;AAAA,IACjC,OAAO,OAAO,MAAM,QAAQ;AAAA,EAC9B;AACF;AAEA,SAAS,cACP,YACgE;AAChE,QAAM,cAAsC,CAAC;AAC7C,QAAM,SAAmB,CAAC;AAC1B,aAAW,CAAC,IAAI,GAAG,KAAK,OAAO,QAAQ,UAAU,GAAG;AAClD,gBAAY,EAAE,IAAI,IAAI;AACtB,WAAO,KAAK,IAAI,aAAa;AAAA,EAC/B;AACA,SAAO;AAAA,IACL,eAAe,OAAO,WAAW,IAAI,IAAI,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,OAAO;AAAA,IACpF;AAAA,EACF;AACF;AAEA,IAAM,8BAA8B;AAAA,EAClC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AA2BA,eAAsB,YACpB,MACkD;AAClD,QAAM,YAAY,KAAK,IAAI;AAE3B,QAAM,SAAS,KAAK,UAAU,CAAC;AAC/B,QAAM,cAAc,OAAO,eAAe;AAC1C,QAAM,iBAAiB,OAAO,kBAAkB;AAChD,QAAM,iBAAiB,OAAO,kBAAkB;AAChD,QAAM,kBAAkB,OAAO,mBAAmB;AAClD,QAAM,cAAc,OAAO;AAE3B,QAAM,kBAAkB,OAAO;AAC/B,QAAM,EAAE,OAAO,QAAQ,IAAI,kBACvB;AAAA,IACE,OAAO,KAAK,UAAU,OAAO,CAAC,MAAM,CAAC,gBAAgB,KAAK,CAAC,MAAM,EAAE,OAAO,EAAE,EAAE,CAAC;AAAA,IAC/E,SAAS;AAAA,EACX,IACA,kBAAkB,KAAK,WAAW,eAAe;AAErD,MAAI,MAAM,WAAW,GAAG;AACtB,UAAM,IAAI,MAAM,mFAAmF;AAAA,EACrG;AACA,MAAI,QAAQ,WAAW,GAAG;AACxB,UAAM,IAAI,MAAM,2DAA2D;AAAA,EAC7E;AAEA,QAAM,SACJ,KAAK,UACL,WAAW;AAAA,IACT,KAAK;AAAA,MACH,SAAS,KAAK,KAAK,WAAW;AAAA,MAC9B,QAAQ,KAAK,KAAK,UAAU,QAAQ,IAAI,kBAAkB;AAAA,IAC5D;AAAA,IACA,OAAO,KAAK,KAAK,SAAS;AAAA,IAC1B,QAAQ,KAAK,gBAAgB;AAAA,IAC7B,oBAAoB,KAAK,sBAAsB;AAAA,EACjD,CAAC;AAEH,QAAM,OACJ,KAAK,QACL,sBAA4C;AAAA,IAC1C,kBAAkB;AAAA,IAClB,gBAAgB;AAAA,EAClB,CAAC;AAEH,QAAM,UAAU,KAAK,WAAW,wBAAwB;AACxD,QAAM,SAAS,KAAK,UAAU,qBAAqB,SAAS;AAE5D,MAAI,KAAK,YAAY;AACnB,S
AAK,WAAW,EAAE,MAAM,oBAAoB,WAAW,KAAK,UAAU,OAAO,CAAC;AAAA,EAChF;AAEA,QAAM,SAAS,MAAM,mBAAyC;AAAA,IAC5D,WAAW;AAAA,IACX,iBAAiB,KAAK;AAAA,IACtB,qBAAqB,KAAK;AAAA,IAC1B;AAAA,IACA,QAAQ,CAAC,KAAK,KAAK;AAAA,IACnB;AAAA,IACA,gBAAgB;AAAA,IAChB,kBAAkB;AAAA,IAClB;AAAA,IACA,eAAe,KAAK,iBAAiB;AAAA,IACrC,SAAS,KAAK;AAAA,IACd,QAAQ,KAAK;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA,eAAe,KAAK;AAAA,IACpB;AAAA,EACF,CAAC;AAED,QAAM,WAAW,cAAc,OAAO,kBAAkB,WAAW,UAAU;AAC7E,QAAM,cAAc,cAAc,OAAO,gBAAgB,WAAW,UAAU;AAE9E,MAAI,KAAK,YAAY;AACnB,SAAK,WAAW;AAAA,MACd,MAAM;AAAA,MACN,eAAe,SAAS;AAAA,MACxB,YAAY,KAAK,IAAI,IAAI;AAAA,IAC3B,CAAC;AACD,SAAK,WAAW;AAAA,MACd,MAAM;AAAA,MACN,UAAU,OAAO,WAAW;AAAA,MAC5B,MAAM,YAAY,gBAAgB,SAAS;AAAA,IAC7C,CAAC;AAAA,EACH;AAEA,QAAM,YACJ,OAAO,iBAAiB,WAAW,eACnC,OAAO,YAAY;AAAA,IACjB,CAAC,KAAK,QAAQ,MAAM,IAAI,SAAS,OAAO,CAAC,GAAG,OAAO,IAAI,GAAG,SAAS,WAAW,cAAc,CAAC;AAAA,IAC7F;AAAA,EACF;AAEF,SAAO;AAAA,IACL;AAAA,IACA,QAAQ;AAAA,MACN,GAAG;AAAA,MACH,SAAS,OAAO;AAAA,IAClB;AAAA,IACA,MAAM,YAAY,gBAAgB,SAAS;AAAA,IAC3C,cAAc,OAAO,WAAW;AAAA,IAChC,qBAAqB,OAAO,YAAY;AAAA,IACxC,YAAY,KAAK,IAAI,IAAI;AAAA,IACzB,cAAc;AAAA,IACd,KAAK;AAAA,EACP;AACF;","names":[]}
1
+ {"version":3,"sources":["../../src/contract/self-improve.ts"],"sourcesContent":["/**\n * # `selfImprove()` — the LAND-tier one-shot.\n *\n * The cheapest possible call site to run a real closed-loop self-\n * improvement over your agent. Wraps `runImprovementLoop` with smart\n * defaults and a budget-shaped options API; every escape hatch the\n * substrate exposes is reachable from here without losing the\n * one-function feel.\n *\n * Defaults picked to match the LAND-tier story:\n * - In-memory storage (no filesystem touch).\n * - `gepaDriver` reflective mutation with copywriting-flavored primitives\n * (override `driver` or `mutationPrimitives` for any domain).\n * - `defaultProductionGate` with `deltaThreshold: 0.05`.\n * - Held-out split = 25% of scenarios, deterministic by id hash.\n * - 3 generations × population 2 (raise via `budget` for more search).\n * - `autoOnPromote: 'none'` (we don't open PRs unless you ask).\n *\n * Want one-click? Provide `agent` + `scenarios` + `judge`. Done.\n * Want distributed? Pass `cellPlacement` + an `httpDispatch`-backed\n * agent. Want a code-tier surface? Pass a `MutableSurface` + your own\n * `driver`. Same function.\n */\n\nimport { gepaDriver } from '../campaign/drivers/gepa'\nimport { defaultProductionGate } from '../campaign/gates/default-production-gate'\nimport {\n type RunImprovementLoopResult,\n runImprovementLoop,\n} from '../campaign/presets/run-improvement-loop'\nimport { type CampaignStorage, inMemoryCampaignStorage } from '../campaign/storage'\nimport type {\n DispatchContext,\n Gate,\n ImprovementDriver,\n JudgeConfig,\n MutableSurface,\n Scenario,\n} from '../campaign/types'\nimport { createHostedClient, type HostedTenant } from '../hosted/client'\nimport type {\n EvalRunCellScore,\n EvalRunEvent,\n EvalRunGenerationSnapshot,\n} from '../hosted/types'\n\nexport interface SelfImproveBudget {\n /** Hard $ ceiling across all cells in baseline + every generation. 
Cells\n * beyond the ceiling are skipped (cost-aware, not aborted). */\n dollars?: number\n /** How many improvement generations to explore. Default 3. Set 0 to\n * skip improvement entirely (selfImprove becomes a baseline-only run). */\n generations?: number\n /** Candidates the driver proposes per generation. Default 2. */\n populationSize?: number\n /** Max concurrent cells across the loop. Default 2. */\n maxConcurrency?: number\n /** Fraction of `scenarios` held out from training, used for the gate.\n * Default 0.25. Ignored when `holdoutScenarios` is set explicitly. */\n holdoutFraction?: number\n /** Explicit held-out scenarios; overrides `holdoutFraction`. */\n holdoutScenarios?: Scenario[]\n}\n\nexport interface SelfImproveLlm {\n /** Endpoint base URL. Default Tangle Router. */\n baseUrl?: string\n /** Bearer token. Default `process.env.OPENAI_API_KEY`. */\n apiKey?: string\n /** Model id used by `gepaDriver` reflection. Default\n * `anthropic/claude-sonnet-4.6`. */\n model?: string\n}\n\nexport type SelfImproveProgressEvent =\n | { kind: 'baseline.started'; scenarios: number }\n | { kind: 'baseline.completed'; compositeMean: number; durationMs: number }\n | { kind: 'generation.started'; index: number; populationSize: number }\n | { kind: 'generation.completed'; index: number; bestComposite: number; durationMs: number }\n | { kind: 'gate.decided'; decision: string; lift: number }\n\nexport interface SelfImproveOptions<TScenario extends Scenario, TArtifact> {\n /**\n * Your agent — a function that takes the current `MutableSurface`\n * (typically a system prompt the loop is optimizing) plus the\n * scenario + cell ctx, and returns the artifact your judge scores.\n *\n * Same shape as `RunOptimizationOptions.dispatchWithSurface`. 
Wrap a\n * plain `Dispatch` if you don't have a surface seam:\n *\n * agent: (_surface, scenario, ctx) => yourPlainDispatch(scenario, ctx)\n *\n * That mode evaluates without mutating any surface — useful as a\n * baseline-only run (set `budget.generations = 0`).\n */\n agent: (surface: MutableSurface, scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>\n\n /** Scenarios to evaluate against. Train/holdout split is computed from\n * these unless `budget.holdoutScenarios` is set explicitly. */\n scenarios: TScenario[]\n\n /** Judge that scores artifacts. Bring your own; use `langchainJudge`\n * from `/adapters/langchain` for a Runnable-shaped one. */\n judge: JudgeConfig<TArtifact, TScenario>\n\n /** Starting surface — system prompt, JSON config, anything `MutableSurface`\n * accepts. The driver mutates this each generation. */\n baselineSurface: MutableSurface\n\n /** Budget + loop shape. All fields optional; defaults pick the LAND-tier\n * story. */\n budget?: SelfImproveBudget\n\n /** Custom driver. Default is `gepaDriver` configured from `llm` +\n * `mutationPrimitives`. */\n driver?: ImprovementDriver\n\n /** Default-driver overrides — used when `driver` is unset. */\n mutationPrimitives?: string[]\n driverTarget?: string\n\n /** Custom gate. Default is `defaultProductionGate` with\n * `deltaThreshold: 0.05` on the held-out split. */\n gate?: Gate<TArtifact, TScenario>\n\n /** LLM config consumed by the default `gepaDriver`. Ignored if you pass\n * your own `driver`. */\n llm?: SelfImproveLlm\n\n /** Storage backend. Default `inMemoryCampaignStorage()` — nothing\n * persists past the call. Pass `fsCampaignStorage()` to write to disk. */\n storage?: CampaignStorage\n\n /** Run directory (logical for in-memory storage, real path for fs).\n * Default `mem://selfImprove-<timestamp>`. 
*/\n runDir?: string\n\n /** Distributed-driver seam — same as `RunCampaignOptions.cellPlacement`.\n * Returns an opaque placement key the substrate forwards to your agent\n * as `ctx.placement`. Combined with `httpDispatch` from\n * `/adapters/http`, fans cells across regions. */\n cellPlacement?: (input: {\n scenario: TScenario\n rep: number\n generation?: number\n }) => string | undefined\n\n /** Streaming hook — fires on baseline + each generation + gate decision.\n * Consumer routes events wherever (UI, dashboard, logs). */\n onProgress?: (event: SelfImproveProgressEvent) => void\n\n /** Auto-promotion behavior on a ship decision. Default `'none'` — we\n * return the winner; you ship it however you ship. `'pr'` opens a\n * GitHub PR via `openAutoPr`; requires `ghOwner` + `ghRepo`. */\n autoOnPromote?: 'pr' | 'none'\n ghOwner?: string\n ghRepo?: string\n\n /**\n * Opt-in: ship eval-run events to a hosted orchestrator (ours, your\n * self-hosted one, or any compatible implementation of the\n * `docs/hosted-ingest-spec.md` wire format). When set, the substrate\n * POSTs the final `EvalRunEvent` to `${endpoint}/v1/ingest/eval-runs`\n * after the loop completes. Failures are logged but do not fail the\n * loop — local result is always returned.\n *\n * For our orchestrator: `{ endpoint: 'https://orchestrator.tangle.tools/v1', apiKey, tenantId }`.\n *\n * For your self-hosted: any URL serving the wire format. See\n * `examples/hosted-ingest-server/` for the reference receiver.\n */\n hostedTenant?: HostedTenant\n\n /** Free-form labels attached to the hosted event (env, branch, model id,\n * etc.). Ignored when `hostedTenant` is unset. */\n hostedLabels?: Record<string, string>\n}\n\nexport interface SelfImproveResult<TScenario extends Scenario, TArtifact> {\n /** Composite mean across all scenarios, baseline run. */\n baseline: {\n compositeMean: number\n perScenario: Record<string, number>\n }\n /** Composite mean on the held-out set, winner run. 
*/\n winner: {\n compositeMean: number\n perScenario: Record<string, number>\n surface: MutableSurface\n }\n /** `winner.compositeMean - baselineOnHoldout.compositeMean`. Positive\n * means the gate observed improvement. */\n lift: number\n /** `defaultProductionGate.decide()` result. */\n gateDecision: 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling'\n /** Number of generations actually explored (may be less than the\n * budget if the driver gave up early). */\n generationsExplored: number\n /** Wall-clock total. */\n durationMs: number\n /** Total cost across baseline + every generation. */\n totalCostUsd: number\n /**\n * Raw substrate result for advanced inspection — full per-generation\n * candidates, full campaign artifacts, all judge scores. Useful for\n * debugging or reporting beyond the summary.\n */\n raw: RunImprovementLoopResult<TArtifact, TScenario>\n}\n\n/**\n * Deterministic train/holdout split by a stable hash of `scenario.id`,\n * so the same scenario set always splits the same way across runs.\n */\nfunction splitTrainHoldout<TScenario extends Scenario>(\n scenarios: TScenario[],\n fraction: number,\n): { train: TScenario[]; holdout: TScenario[] } {\n // Stable fnv-1a-ish hash of the id for ordering.\n function hash(s: string): number {\n let h = 2166136261 >>> 0\n for (let i = 0; i < s.length; i++) {\n h ^= s.charCodeAt(i)\n h = Math.imul(h, 16777619) >>> 0\n }\n return h\n }\n const sorted = [...scenarios].sort((a, b) => hash(a.id) - hash(b.id))\n const nHoldout = Math.max(1, Math.min(sorted.length - 1, Math.round(sorted.length * fraction)))\n return {\n holdout: sorted.slice(0, nHoldout),\n train: sorted.slice(nHoldout),\n }\n}\n\nfunction meanComposite(byScenario: Record<string, { meanComposite: number }>): {\n compositeMean: number\n perScenario: Record<string, number>\n} {\n const perScenario: Record<string, number> = {}\n const values: number[] = []\n for (const [id, agg] of Object.entries(byScenario)) {\n 
perScenario[id] = agg.meanComposite\n values.push(agg.meanComposite)\n }\n return {\n compositeMean: values.length === 0 ? 0 : values.reduce((s, v) => s + v, 0) / values.length,\n perScenario,\n }\n}\n\nconst DEFAULT_MUTATION_PRIMITIVES = [\n 'Tighten the hook: lead with the specific user outcome.',\n 'Replace generic adjectives with specific verbs or proof numbers.',\n \"Anchor every claim in something the scenario's brief literally supports.\",\n 'Honor the surface-shape constraint (length, register, audience vocabulary).',\n]\n\n/**\n * One-shot self-improvement loop. See module docstring for defaults +\n * extension points.\n *\n * @example Minimum (LAND tier):\n *\n * const result = await selfImprove({\n * agent: (surface, scenario, ctx) => myAgent(surface, scenario, ctx.signal),\n * scenarios,\n * judge,\n * baselineSurface: DEFAULT_PROMPT,\n * })\n * console.log(`lift: ${result.lift.toFixed(3)} (${result.gateDecision})`)\n *\n * @example Distributed (workers in three regions):\n *\n * await selfImprove({\n * agent: httpDispatch({ resolveUrl: ({ placement }) => REGION_URLS[placement!] }),\n * scenarios,\n * judge,\n * baselineSurface: DEFAULT_PROMPT,\n * cellPlacement: ({ scenario }) => scenario.region,\n * budget: { maxConcurrency: 12 },\n * })\n */\nexport async function selfImprove<TScenario extends Scenario, TArtifact>(\n opts: SelfImproveOptions<TScenario, TArtifact>,\n): Promise<SelfImproveResult<TScenario, TArtifact>> {\n const startedAt = Date.now()\n\n const budget = opts.budget ?? {}\n const generations = budget.generations ?? 3\n const populationSize = budget.populationSize ?? 2\n const maxConcurrency = budget.maxConcurrency ?? 2\n const holdoutFraction = budget.holdoutFraction ?? 0.25\n const costCeiling = budget.dollars\n\n const explicitHoldout = budget.holdoutScenarios\n const { train, holdout } = explicitHoldout\n ? 
{\n train: opts.scenarios.filter((s) => !explicitHoldout.some((h) => h.id === s.id)),\n holdout: explicitHoldout as TScenario[],\n }\n : splitTrainHoldout(opts.scenarios, holdoutFraction)\n\n if (train.length === 0) {\n throw new Error(\n 'selfImprove: train split is empty. Reduce holdoutFraction or pass more scenarios.',\n )\n }\n if (holdout.length === 0) {\n throw new Error('selfImprove: holdout split is empty. Pass more scenarios.')\n }\n\n const driver: ImprovementDriver =\n opts.driver ??\n gepaDriver({\n llm: {\n baseUrl: opts.llm?.baseUrl ?? 'https://router.tangle.tools/v1',\n apiKey: opts.llm?.apiKey ?? process.env.OPENAI_API_KEY ?? '',\n },\n model: opts.llm?.model ?? 'anthropic/claude-sonnet-4.6',\n target:\n opts.driverTarget ??\n 'agent surface (system prompt or config) being optimized by selfImprove',\n mutationPrimitives: opts.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES,\n })\n\n const gate: Gate<TArtifact, TScenario> =\n opts.gate ??\n defaultProductionGate<TArtifact, TScenario>({\n holdoutScenarios: holdout,\n deltaThreshold: 0.05,\n })\n\n const storage = opts.storage ?? inMemoryCampaignStorage()\n const runDir = opts.runDir ?? `mem://selfImprove-${startedAt}`\n\n if (opts.onProgress) {\n opts.onProgress({ kind: 'baseline.started', scenarios: opts.scenarios.length })\n }\n\n const result = await runImprovementLoop<TScenario, TArtifact>({\n scenarios: train,\n baselineSurface: opts.baselineSurface,\n dispatchWithSurface: opts.agent,\n driver,\n judges: [opts.judge],\n populationSize,\n maxGenerations: generations,\n holdoutScenarios: holdout,\n gate,\n autoOnPromote: opts.autoOnPromote ?? 
'none',\n ghOwner: opts.ghOwner,\n ghRepo: opts.ghRepo,\n storage,\n runDir,\n maxConcurrency,\n cellPlacement: opts.cellPlacement,\n costCeiling,\n })\n\n const baseline = meanComposite(result.baselineOnHoldout.aggregates.byScenario)\n const winnerStats = meanComposite(result.winnerOnHoldout.aggregates.byScenario)\n\n if (opts.onProgress) {\n opts.onProgress({\n kind: 'baseline.completed',\n compositeMean: baseline.compositeMean,\n durationMs: Date.now() - startedAt,\n })\n opts.onProgress({\n kind: 'gate.decided',\n decision: result.gateResult.decision,\n lift: winnerStats.compositeMean - baseline.compositeMean,\n })\n }\n\n const totalCost =\n result.baselineCampaign.aggregates.totalCostUsd +\n result.generations.reduce(\n (sum, gen) =>\n sum + gen.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0),\n 0,\n )\n\n const summary: SelfImproveResult<TScenario, TArtifact> = {\n baseline,\n winner: {\n ...winnerStats,\n surface: result.winnerSurface,\n },\n lift: winnerStats.compositeMean - baseline.compositeMean,\n gateDecision: result.gateResult.decision,\n generationsExplored: result.generations.length,\n durationMs: Date.now() - startedAt,\n totalCostUsd: totalCost,\n raw: result,\n }\n\n // Opt-in hosted ingest. Failures logged but never fail the loop — the\n // local result is always returned. This matches the wedge-doc invariant\n // that LAND-tier never blocks on EXPAND-tier infra.\n if (opts.hostedTenant) {\n try {\n await shipEvalRunToHosted(opts.hostedTenant, opts, summary, result, runDir)\n } catch (err) {\n const msg = err instanceof Error ? 
err.message : String(err)\n // eslint-disable-next-line no-console -- intentional: hosted-ingest is best-effort\n console.warn(`[agent-eval] hosted ingest failed (continuing): ${msg}`)\n }\n }\n\n return summary\n}\n\nasync function shipEvalRunToHosted<TScenario extends Scenario, TArtifact>(\n tenant: HostedTenant,\n opts: SelfImproveOptions<TScenario, TArtifact>,\n summary: SelfImproveResult<TScenario, TArtifact>,\n raw: RunImprovementLoopResult<TArtifact, TScenario>,\n runDir: string,\n): Promise<void> {\n const client = createHostedClient(tenant)\n\n function snapshotFromCampaign(\n index: number,\n surface: MutableSurface | undefined,\n campaign: RunImprovementLoopResult<TArtifact, TScenario>['baselineCampaign'],\n durationMs: number,\n ): EvalRunGenerationSnapshot {\n const cells: EvalRunCellScore[] = campaign.cells.map((cell) => {\n const judgeScores = Object.values(cell.judgeScores)\n const composite =\n judgeScores.length === 0\n ? 0\n : judgeScores.reduce((s, j) => s + j.composite, 0) / judgeScores.length\n return {\n scenarioId: cell.scenarioId,\n rep: cell.rep,\n compositeMean: composite,\n dimensions: Object.fromEntries(\n Object.entries(cell.judgeScores).map(([name, score]) => [name, score.dimensions]),\n ),\n errorMessage: cell.error ?? undefined,\n }\n })\n const compositeMean =\n cells.length === 0 ? 0 : cells.reduce((s, c) => s + c.compositeMean, 0) / cells.length\n return {\n index,\n surfaceHash: typeof surface === 'string' ? hashString(surface) : hashString(JSON.stringify(surface ?? '')),\n surface,\n cells,\n compositeMean,\n costUsd: campaign.aggregates.totalCostUsd,\n durationMs,\n }\n }\n\n const generations: EvalRunGenerationSnapshot[] = []\n // Baseline as generation 0.\n generations.push(snapshotFromCampaign(0, opts.baselineSurface, raw.baselineCampaign, 0))\n // Improvement generations as 1..N. 
Substrate stores per-surface campaigns\n // per generation — we summarize the WINNING surface per generation here.\n for (const gen of raw.generations) {\n const winner = gen.surfaces.reduce((best, s) =>\n s.campaign.aggregates.cellsExecuted > 0 &&\n (best === undefined || averageComposite(s.campaign) > averageComposite(best.campaign))\n ? s\n : best,\n gen.surfaces[0],\n )\n if (!winner) continue\n generations.push(\n snapshotFromCampaign(gen.record.generationIndex + 1, winner.surface, winner.campaign, 0),\n )\n }\n\n const event: EvalRunEvent = {\n runId: `${runDir}#${Date.now()}`,\n runDir,\n timestamp: new Date().toISOString(),\n status: 'finished',\n labels: opts.hostedLabels ?? {},\n baseline: generations[0],\n generations,\n gateDecision: summary.gateDecision,\n holdoutLift: summary.lift,\n totalCostUsd: summary.totalCostUsd,\n totalDurationMs: summary.durationMs,\n }\n\n await client.ingestEvalRun(event)\n}\n\nfunction averageComposite(\n campaign: RunImprovementLoopResult<unknown, Scenario>['baselineCampaign'],\n): number {\n const aggs = Object.values(campaign.aggregates.byScenario)\n return aggs.length === 0 ? 
0 : aggs.reduce((s, a) => s + a.meanComposite, 0) / aggs.length\n}\n\nfunction hashString(s: string): string {\n let h = 2166136261 >>> 0\n for (let i = 0; i < s.length; i++) {\n h ^= s.charCodeAt(i)\n h = Math.imul(h, 16777619) >>> 0\n }\n return h.toString(16).padStart(8, '0')\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAsNA,SAAS,kBACP,WACA,UAC8C;AAE9C,WAAS,KAAK,GAAmB;AAC/B,QAAI,IAAI,eAAe;AACvB,aAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,WAAK,EAAE,WAAW,CAAC;AACnB,UAAI,KAAK,KAAK,GAAG,QAAQ,MAAM;AAAA,IACjC;AACA,WAAO;AAAA,EACT;AACA,QAAM,SAAS,CAAC,GAAG,SAAS,EAAE,KAAK,CAAC,GAAG,MAAM,KAAK,EAAE,EAAE,IAAI,KAAK,EAAE,EAAE,CAAC;AACpE,QAAM,WAAW,KAAK,IAAI,GAAG,KAAK,IAAI,OAAO,SAAS,GAAG,KAAK,MAAM,OAAO,SAAS,QAAQ,CAAC,CAAC;AAC9F,SAAO;AAAA,IACL,SAAS,OAAO,MAAM,GAAG,QAAQ;AAAA,IACjC,OAAO,OAAO,MAAM,QAAQ;AAAA,EAC9B;AACF;AAEA,SAAS,cAAc,YAGrB;AACA,QAAM,cAAsC,CAAC;AAC7C,QAAM,SAAmB,CAAC;AAC1B,aAAW,CAAC,IAAI,GAAG,KAAK,OAAO,QAAQ,UAAU,GAAG;AAClD,gBAAY,EAAE,IAAI,IAAI;AACtB,WAAO,KAAK,IAAI,aAAa;AAAA,EAC/B;AACA,SAAO;AAAA,IACL,eAAe,OAAO,WAAW,IAAI,IAAI,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,OAAO;AAAA,IACpF;AAAA,EACF;AACF;AAEA,IAAM,8BAA8B;AAAA,EAClC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AA2BA,eAAsB,YACpB,MACkD;AAClD,QAAM,YAAY,KAAK,IAAI;AAE3B,QAAM,SAAS,KAAK,UAAU,CAAC;AAC/B,QAAM,cAAc,OAAO,eAAe;AAC1C,QAAM,iBAAiB,OAAO,kBAAkB;AAChD,QAAM,iBAAiB,OAAO,kBAAkB;AAChD,QAAM,kBAAkB,OAAO,mBAAmB;AAClD,QAAM,cAAc,OAAO;AAE3B,QAAM,kBAAkB,OAAO;AAC/B,QAAM,EAAE,OAAO,QAAQ,IAAI,kBACvB;AAAA,IACE,OAAO,KAAK,UAAU,OAAO,CAAC,MAAM,CAAC,gBAAgB,KAAK,CAAC,MAAM,EAAE,OAAO,EAAE,EAAE,CAAC;AAAA,IAC/E,SAAS;AAAA,EACX,IACA,kBAAkB,KAAK,WAAW,eAAe;AAErD,MAAI,MAAM,WAAW,GAAG;AACtB,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,MAAI,QAAQ,WAAW,GAAG;AACxB,UAAM,IAAI,MAAM,2DAA2D;AAAA,EAC7E;AAEA,QAAM,SACJ,KAAK,UACL,WAAW;AAAA,IACT,KAAK;AAAA,MACH,SAAS,KAAK,KAAK,WAAW;AAAA,MAC9B,QAAQ,KAAK,KAAK,UAAU,QAAQ,IAAI,kBAAkB;AAAA,IAC5D;AAAA,IACA,OAAO,KAAK,KAAK,SAAS;AAAA,IAC1B,QACE,KAAK,gBACL;AAAA,IACF,oBAAoB,KAAK,sBAAsB;AAAA,EACjD,CAAC;AAEH,QAAM,OACJ,KAAK,QACL,sB
AA4C;AAAA,IAC1C,kBAAkB;AAAA,IAClB,gBAAgB;AAAA,EAClB,CAAC;AAEH,QAAM,UAAU,KAAK,WAAW,wBAAwB;AACxD,QAAM,SAAS,KAAK,UAAU,qBAAqB,SAAS;AAE5D,MAAI,KAAK,YAAY;AACnB,SAAK,WAAW,EAAE,MAAM,oBAAoB,WAAW,KAAK,UAAU,OAAO,CAAC;AAAA,EAChF;AAEA,QAAM,SAAS,MAAM,mBAAyC;AAAA,IAC5D,WAAW;AAAA,IACX,iBAAiB,KAAK;AAAA,IACtB,qBAAqB,KAAK;AAAA,IAC1B;AAAA,IACA,QAAQ,CAAC,KAAK,KAAK;AAAA,IACnB;AAAA,IACA,gBAAgB;AAAA,IAChB,kBAAkB;AAAA,IAClB;AAAA,IACA,eAAe,KAAK,iBAAiB;AAAA,IACrC,SAAS,KAAK;AAAA,IACd,QAAQ,KAAK;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA,eAAe,KAAK;AAAA,IACpB;AAAA,EACF,CAAC;AAED,QAAM,WAAW,cAAc,OAAO,kBAAkB,WAAW,UAAU;AAC7E,QAAM,cAAc,cAAc,OAAO,gBAAgB,WAAW,UAAU;AAE9E,MAAI,KAAK,YAAY;AACnB,SAAK,WAAW;AAAA,MACd,MAAM;AAAA,MACN,eAAe,SAAS;AAAA,MACxB,YAAY,KAAK,IAAI,IAAI;AAAA,IAC3B,CAAC;AACD,SAAK,WAAW;AAAA,MACd,MAAM;AAAA,MACN,UAAU,OAAO,WAAW;AAAA,MAC5B,MAAM,YAAY,gBAAgB,SAAS;AAAA,IAC7C,CAAC;AAAA,EACH;AAEA,QAAM,YACJ,OAAO,iBAAiB,WAAW,eACnC,OAAO,YAAY;AAAA,IACjB,CAAC,KAAK,QACJ,MAAM,IAAI,SAAS,OAAO,CAAC,GAAG,OAAO,IAAI,GAAG,SAAS,WAAW,cAAc,CAAC;AAAA,IACjF;AAAA,EACF;AAEF,QAAM,UAAmD;AAAA,IACvD;AAAA,IACA,QAAQ;AAAA,MACN,GAAG;AAAA,MACH,SAAS,OAAO;AAAA,IAClB;AAAA,IACA,MAAM,YAAY,gBAAgB,SAAS;AAAA,IAC3C,cAAc,OAAO,WAAW;AAAA,IAChC,qBAAqB,OAAO,YAAY;AAAA,IACxC,YAAY,KAAK,IAAI,IAAI;AAAA,IACzB,cAAc;AAAA,IACd,KAAK;AAAA,EACP;AAKA,MAAI,KAAK,cAAc;AACrB,QAAI;AACF,YAAM,oBAAoB,KAAK,cAAc,MAAM,SAAS,QAAQ,MAAM;AAAA,IAC5E,SAAS,KAAK;AACZ,YAAM,MAAM,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAE3D,cAAQ,KAAK,mDAAmD,GAAG,EAAE;AAAA,IACvE;AAAA,EACF;AAEA,SAAO;AACT;AAEA,eAAe,oBACb,QACA,MACA,SACA,KACA,QACe;AACf,QAAM,SAAS,mBAAmB,MAAM;AAExC,WAAS,qBACP,OACA,SACA,UACA,YAC2B;AAC3B,UAAM,QAA4B,SAAS,MAAM,IAAI,CAAC,SAAS;AAC7D,YAAM,cAAc,OAAO,OAAO,KAAK,WAAW;AAClD,YAAM,YACJ,YAAY,WAAW,IACnB,IACA,YAAY,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,YAAY;AACrE,aAAO;AAAA,QACL,YAAY,KAAK;AAAA,QACjB,KAAK,KAAK;AAAA,QACV,eAAe;AAAA,QACf,YAAY,OAAO;AAAA,UACjB,OAAO,QAAQ,KAAK,WAAW,EAAE,IAAI,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,MAAM,MAAM,UAAU,CAAC;AAAA,QAClF;AAAA,QACA,cAAc,KAAK,SAAS;AAAA,MAC9B;AAAA,IACF,CAAC;
AACD,UAAM,gBACJ,MAAM,WAAW,IAAI,IAAI,MAAM,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,eAAe,CAAC,IAAI,MAAM;AAClF,WAAO;AAAA,MACL;AAAA,MACA,aAAa,OAAO,YAAY,WAAW,WAAW,OAAO,IAAI,WAAW,KAAK,UAAU,WAAW,EAAE,CAAC;AAAA,MACzG;AAAA,MACA;AAAA,MACA;AAAA,MACA,SAAS,SAAS,WAAW;AAAA,MAC7B;AAAA,IACF;AAAA,EACF;AAEA,QAAM,cAA2C,CAAC;AAElD,cAAY,KAAK,qBAAqB,GAAG,KAAK,iBAAiB,IAAI,kBAAkB,CAAC,CAAC;AAGvF,aAAW,OAAO,IAAI,aAAa;AACjC,UAAM,SAAS,IAAI,SAAS;AAAA,MAAO,CAAC,MAAM,MACxC,EAAE,SAAS,WAAW,gBAAgB,MACrC,SAAS,UAAa,iBAAiB,EAAE,QAAQ,IAAI,iBAAiB,KAAK,QAAQ,KAChF,IACA;AAAA,MACJ,IAAI,SAAS,CAAC;AAAA,IAChB;AACA,QAAI,CAAC,OAAQ;AACb,gBAAY;AAAA,MACV,qBAAqB,IAAI,OAAO,kBAAkB,GAAG,OAAO,SAAS,OAAO,UAAU,CAAC;AAAA,IACzF;AAAA,EACF;AAEA,QAAM,QAAsB;AAAA,IAC1B,OAAO,GAAG,MAAM,IAAI,KAAK,IAAI,CAAC;AAAA,IAC9B;AAAA,IACA,YAAW,oBAAI,KAAK,GAAE,YAAY;AAAA,IAClC,QAAQ;AAAA,IACR,QAAQ,KAAK,gBAAgB,CAAC;AAAA,IAC9B,UAAU,YAAY,CAAC;AAAA,IACvB;AAAA,IACA,cAAc,QAAQ;AAAA,IACtB,aAAa,QAAQ;AAAA,IACrB,cAAc,QAAQ;AAAA,IACtB,iBAAiB,QAAQ;AAAA,EAC3B;AAEA,QAAM,OAAO,cAAc,KAAK;AAClC;AAEA,SAAS,iBACP,UACQ;AACR,QAAM,OAAO,OAAO,OAAO,SAAS,WAAW,UAAU;AACzD,SAAO,KAAK,WAAW,IAAI,IAAI,KAAK,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,eAAe,CAAC,IAAI,KAAK;AACtF;AAEA,SAAS,WAAW,GAAmB;AACrC,MAAI,IAAI,eAAe;AACvB,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,SAAK,EAAE,WAAW,CAAC;AACnB,QAAI,KAAK,KAAK,GAAG,QAAQ,MAAM;AAAA,EACjC;AACA,SAAO,EAAE,SAAS,EAAE,EAAE,SAAS,GAAG,GAAG;AACvC;","names":[]}
@@ -0,0 +1,192 @@
1
+ import { M as MutableSurface, i as GateDecision } from '../types-8u72Gc76.js';
2
+
3
+ /**
4
+ * # Hosted-tier wire format — the schema that EVERY orchestrator (ours,
5
+ * a partner's self-hosted one, a future open implementation) must accept.
6
+ *
7
+ * **Stability:** every type in this file is committed under semver. New
8
+ * minors only ADD optional fields. Breaking changes mean a major bump
9
+ * (`HostedWireVersion` literal increment).
10
+ *
11
+ * The wire format is two event streams in one transport:
12
+ *
13
+ * 1. **Eval-run events** (`POST /v1/ingest/eval-runs`). Posted when a
14
+ * campaign / improvement-loop completes (or per-generation if
15
+ * streaming). Carries the structured result + per-cell scores +
16
+ * surface diffs the orchestrator stores for the dashboard.
17
+ *
18
+ * 2. **Trace spans** (`POST /v1/ingest/traces`). Standard OTLP-shaped
19
+ * spans with a few additional attributes so the orchestrator can
20
+ * pivot from eval-run → underlying execution. Compatible with any
21
+ * OTel collector.
22
+ *
23
+ * Both endpoints are authenticated with a bearer token + a tenant id
24
+ * header. Tenants isolate everything downstream of ingest; no tenant
25
+ * ever sees another tenant's data.
26
+ */
27
+
28
+ declare const HOSTED_WIRE_VERSION: "2026-05-26.v1";
29
+ type HostedWireVersion = typeof HOSTED_WIRE_VERSION;
30
+ /** Every ingest request carries these. */
31
+ interface HostedIngestHeaders {
32
+ /** Bearer token. The orchestrator validates against the tenant key. */
33
+ authorization: `Bearer ${string}`;
34
+ /** Stable tenant id (the orchestrator-side primary key for the tenant). */
35
+ 'x-tangle-tenant-id': string;
36
+ /** Wire-version pin so the server can reject incompatible payloads. */
37
+ 'x-tangle-wire-version': HostedWireVersion;
38
+ /** Optional idempotency key for retry-safe ingest. */
39
+ 'idempotency-key'?: string;
40
+ }
41
+ /** Lifecycle stages of an eval-run as the substrate reports them. */
42
+ type EvalRunStatus = 'started' | 'baseline-complete' | 'generation-complete' | 'gate-decided' | 'finished' | 'errored';
43
+ interface EvalRunCellScore {
44
+ /** Stable scenario id from the consumer's scenario set. */
45
+ scenarioId: string;
46
+ /** Repetition index when reps > 1; 0 for the default. */
47
+ rep: number;
48
+ /** Composite score across all judges + dimensions for this cell. */
49
+ compositeMean: number;
50
+ /** Per-judge → per-dimension scores (outer key = judge name, inner key = dimension name). A judge that did not run has no entry. */
51
+ dimensions: Record<string, Record<string, number>>;
52
+ /** Per-cell error message if the dispatch threw. Omitted on success. */
53
+ errorMessage?: string;
54
+ }
55
+ interface EvalRunGenerationSnapshot {
56
+ /** Generation index. 0 is baseline. */
57
+ index: number;
58
+ /** Candidate surface fingerprint (stable hash) — pivot key into the
59
+ * trace stream to fetch the underlying execution. */
60
+ surfaceHash: string;
61
+ /** The candidate surface itself. May be omitted to avoid PII when the
62
+ * consumer prefers not to ship verbatim prompts. */
63
+ surface?: MutableSurface;
64
+ /** Per-cell scores for this generation. */
65
+ cells: EvalRunCellScore[];
66
+ /** Aggregate composite mean across all cells in this generation. */
67
+ compositeMean: number;
68
+ /** Total $ spent across this generation. */
69
+ costUsd: number;
70
+ /** Wall-clock duration of this generation. */
71
+ durationMs: number;
72
+ }
73
+ /**
74
+ * The top-level eval-run event. Typically one ingest call per logical
75
+ * eval-run; when streaming, generations arrive incrementally via repeated
76
+ * calls with the same `runId`. The orchestrator deduplicates by `(tenantId, runId, generation.index)`.
77
+ */
78
+ interface EvalRunEvent {
79
+ /** Stable run id (the substrate's `runId`). UUID or substrate-generated. */
80
+ runId: string;
81
+ /** Where this run was happening — derived from `RunCampaignOptions.runDir`. */
82
+ runDir: string;
83
+ /** ISO-8601 timestamp the substrate recorded the event. */
84
+ timestamp: string;
85
+ /** Lifecycle stage this event represents. */
86
+ status: EvalRunStatus;
87
+ /** Free-form consumer tags (env, branch, model id, etc.). Searchable. */
88
+ labels: Record<string, string>;
89
+ /** Baseline campaign snapshot. Present when status >= baseline-complete. */
90
+ baseline?: EvalRunGenerationSnapshot;
91
+ /** Per-generation snapshots. Streams in; orchestrator appends. */
92
+ generations: EvalRunGenerationSnapshot[];
93
+ /** Final gate decision. Present when status >= gate-decided. */
94
+ gateDecision?: GateDecision;
95
+ /** Held-out lift = winner-on-holdout - baseline-on-holdout. */
96
+ holdoutLift?: number;
97
+ /** Total $ spent across baseline + every generation. */
98
+ totalCostUsd: number;
99
+ /** Total wall-clock duration. */
100
+ totalDurationMs: number;
101
+ /** Error message if status === 'errored'. */
102
+ errorMessage?: string;
103
+ }
104
+ /**
105
+ * OTel-shape span with a few additional attributes for eval-run pivoting.
106
+ * Compatible with any OTLP collector — `name`, `traceId`, `spanId`,
107
+ * `startTimeUnixNano`, `endTimeUnixNano`, `attributes` are stock OTel.
108
+ */
109
+ interface TraceSpanEvent {
110
+ traceId: string;
111
+ spanId: string;
112
+ parentSpanId?: string;
113
+ name: string;
114
+ startTimeUnixNano: number;
115
+ endTimeUnixNano: number;
116
+ attributes: Record<string, string | number | boolean>;
117
+ events?: Array<{
118
+ timeUnixNano: number;
119
+ name: string;
120
+ attributes?: Record<string, string | number | boolean>;
121
+ }>;
122
+ status?: {
123
+ code: 'OK' | 'ERROR' | 'UNSET';
124
+ message?: string;
125
+ };
126
+ /** Pivot back into the eval-run stream. */
127
+ 'tangle.runId'?: string;
128
+ /** Pivot to the specific generation. */
129
+ 'tangle.generation'?: number;
130
+ /** Pivot to the specific cell. */
131
+ 'tangle.cellId'?: string;
132
+ /** Pivot to the specific scenario. */
133
+ 'tangle.scenarioId'?: string;
134
+ }
135
+ interface IngestEvalRunsRequest {
136
+ wireVersion: HostedWireVersion;
137
+ events: EvalRunEvent[];
138
+ }
139
+ interface IngestTracesRequest {
140
+ wireVersion: HostedWireVersion;
141
+ spans: TraceSpanEvent[];
142
+ }
143
+ interface IngestResponse {
144
+ /** Accepted events / spans count. */
145
+ accepted: number;
146
+ /** Rejected events with reasons (validation failures, dup idempotency key, etc.). */
147
+ rejected: Array<{
148
+ index: number;
149
+ reason: string;
150
+ }>;
151
+ }
152
+
153
+ /**
154
+ * # Hosted-tier ingest client.
155
+ *
156
+ * Ships eval-run events + trace spans to any orchestrator (ours, a
157
+ * partner's self-hosted one, or a future open implementation) that
158
+ * speaks the wire format in `./types.ts`.
159
+ *
160
+ * Three modes:
161
+ * - **Ours:** point at `https://orchestrator.tangle.tools/v1`. We
162
+ * handle ingest + storage + dashboard.
163
+ * - **Self-hosted:** point at whatever URL runs the reference receiver
164
+ * from `examples/hosted-ingest-server/`.
165
+ * - **Off (default):** when `hostedTenant` is unset, nothing is sent.
166
+ * Everything stays local.
167
+ */
168
+
169
+ interface HostedTenant {
170
+ /** Orchestrator endpoint base URL (no trailing slash). Required. */
171
+ endpoint: string;
172
+ /** Bearer token issued by the orchestrator. Required. */
173
+ apiKey: string;
174
+ /** Tenant id — the orchestrator's primary key for this consumer. Required. */
175
+ tenantId: string;
176
+ /** Optional `fetch` override (auth wrappers, custom agent, test mocks). */
177
+ fetchImpl?: typeof fetch;
178
+ /** Per-call timeout in ms. Default 30s. */
179
+ timeoutMs?: number;
180
+ /** Retries on 5xx / 408 / 429 responses and network errors. Default 2. */
181
+ retries?: number;
182
+ }
183
+ interface HostedClient {
184
+ ingestEvalRun(event: EvalRunEvent, idempotencyKey?: string): Promise<IngestResponse>;
185
+ ingestEvalRuns(events: EvalRunEvent[], idempotencyKey?: string): Promise<IngestResponse>;
186
+ ingestTraces(spans: TraceSpanEvent[], idempotencyKey?: string): Promise<IngestResponse>;
187
+ readonly tenant: HostedTenant;
188
+ readonly wireVersion: HostedWireVersion;
189
+ }
190
+ declare function createHostedClient(tenant: HostedTenant): HostedClient;
191
+
192
+ export { type EvalRunCellScore, type EvalRunEvent, type EvalRunGenerationSnapshot, type EvalRunStatus, HOSTED_WIRE_VERSION, type HostedClient, type HostedIngestHeaders, type HostedTenant, type HostedWireVersion, type IngestEvalRunsRequest, type IngestResponse, type IngestTracesRequest, type TraceSpanEvent, createHostedClient };
@@ -0,0 +1,10 @@
1
+ import {
2
+ HOSTED_WIRE_VERSION,
3
+ createHostedClient
4
+ } from "../chunk-ZQABFCVJ.js";
5
+ import "../chunk-NSBPE2FW.js";
6
+ export {
7
+ HOSTED_WIRE_VERSION,
8
+ createHostedClient
9
+ };
10
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.45.0",
5
+ "version": "0.46.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -0,0 +1,125 @@
1
+ # Phase D RFC — hosted-tier substrate
2
+
3
+ Pinned scope decisions for the EXPAND tier. What we built, what we
4
+ deliberately did NOT, and what's gated on Phase B evidence.
5
+
6
+ ---
7
+
8
+ ## What's in this version
9
+
10
+ **Wire-format substrate (shipped):**
11
+
12
+ 1. `@tangle-network/agent-eval/hosted` — public client + types for shipping
13
+ eval-run events + trace spans to any orchestrator that speaks the wire
14
+ format.
15
+ 2. `docs/hosted-ingest-spec.md` — semver-committed wire spec
16
+ (`HostedWireVersion = "2026-05-26.v1"`).
17
+ 3. `examples/hosted-ingest-server/` — minimal hono-based reference
18
+ receiver (~200 LOC). Executable spec. Stays as the reference even
19
+ after the production orchestrator ships.
20
+ 4. `selfImprove({ hostedTenant })` opt-in — when set, the substrate
21
+ POSTs the final eval-run event to the configured endpoint. Failures
22
+ are logged but never fail the loop (LAND tier never blocks on
23
+ EXPAND-tier infra).
24
+
25
+ **Production orchestrator (started):**
26
+
27
+ 5. HTTP ingest service in `@tangle-network/monorepo` accepting the wire
28
+ format. Lives under the orchestrator app. Tenant auth + isolation
29
+ + persistent storage + read endpoints. *Started this session — see
30
+ the @tangle-network/agent-dev-container PR. Not feature-complete:
31
+ tenant CRUD + adversarial isolation tests pending.*
32
+
33
+ ## What's deliberately deferred
34
+
35
+ The wedge doc gates these on Phase B evidence — partner-validated
36
+ signal about what the hosted product actually needs to do. Shipping
37
+ them without that signal risks building the wrong thing.
38
+
39
+ | Deferred until Phase B passes | Why |
40
+ |---|---|
41
+ | **Metered billing wire-up (Stripe + cost-ledger)** | The billable units (per-eval-run, per-ingested-MB, per-seat) depend on actual partner consumption patterns. Picking dimensions in a vacuum locks us into wrong pricing. |
42
+ | **Multi-tenant dashboard UX** | Partners' first dashboard request defines the right default views. We have a stub list-runs page; the rest is post-signal. |
43
+ | **Webhook callbacks per tenant** | The events partners want pushed (gate-decided, cost-threshold, regression-alert) are partner-shaped. Add them when a partner asks. |
44
+ | **Cross-tenant aggregation / benchmarking** | This is the "Datadog for agents" tier — explicit roadmap, requires user volume we don't have. |
45
+ | **Sandbox-cost roll-up into hosted billing** | Cross-product billing integration requires PLATFORM-tier partners. Out of scope until at least one. |
46
+ | **Trace UI** | OTel-shape spans store fine. Visualization comes after partners ask. Phoenix / Jaeger / any OTLP-compatible viewer covers it in the interim. |
47
+ | **SOC 2 / compliance audit work** | Required for enterprise; not required for design partners. |
48
+
49
+ ## Architecture decisions locked
50
+
51
+ These are committed and won't change without a major-version wire bump
52
+ or a documented migration:
53
+
54
+ 1. **Wire format is JSON over HTTP**, not gRPC. Reasons: works in
55
+ browsers + edge + node + curl; OTel-compatible at the trace stream
56
+ level; lowest possible barrier to a self-hosted orchestrator.
57
+ 2. **Tenant auth is bearer-token + tenant-id header**, not OIDC /
58
+ service-account / mutual-TLS. Reasons: simplest thing that's
59
+ actually secure with proper key handling; defers complex IAM until
60
+ enterprise demand.
61
+ 3. **Idempotency via header, not transactional API**. Servers MUST
62
+ dedupe by `(tenantId, Idempotency-Key)` for 24h. Simpler than
63
+ making clients commit transactions.
64
+ 4. **Eval-runs and traces are SEPARATE streams** with pivot keys
65
+ (`tangle.runId` etc.) on spans. Reasons: traces can be best-effort
66
+ (lossy) without corrupting eval-run semantics; orchestrators can
67
+ prioritize eval-run durability without forcing trace durability.
68
+ 5. **Wire version is a date.v-N string**, not semver. Reasons: dates
69
+ communicate "when was this contract frozen"; v-N captures
70
+ incremental additive (non-breaking) revisions under the same date; a breaking change gets a new date.
71
+
72
+ ## Open questions for Phase B to answer
73
+
74
+ When the design-partner pairing happens, capture answers to these
75
+ explicitly:
76
+
77
+ 1. **Surface confidentiality**: do partners want the verbatim surface
78
+ (system prompt) shipped, or just the hash? Today the wire format
79
+ has `surface?` as optional; partner default is what we ship.
80
+ 2. **Trace sampling**: at what cells-per-second do trace spans become
81
+ noise? What's the right default sampling rate?
82
+ 3. **Cost attribution granularity**: per cell? per generation? per
83
+ run? Per judge dimension? Partner needs determine what we surface
84
+ in billing reports.
85
+ 4. **Replay**: do partners want to re-run an old eval-run from the
86
+ stored data? That would require us to store more than the summary —
87
+ actual artifacts + prompts. Storage cost implication.
88
+ 5. **PII / sensitive scenarios**: how do partners want to handle
89
+ scenarios containing user data? Encryption-at-rest is table stakes;
90
+ redaction-at-ingest may be required for some.
91
+
92
+ The partner pairing kit (`docs/phase-b-pairing-kit.md`) has discovery
93
+ questions that probe these.
94
+
95
+ ## Non-goals (explicit)
96
+
97
+ This RFC does NOT plan for:
98
+
99
+ - Replacing Langfuse / Phoenix / Arize. We INGEST OTel; we don't
100
+ build a generic trace viewer. The dashboard is eval-run-shaped, not
101
+ trace-shaped.
102
+ - Becoming a model gateway. Tangle Router exists; the hosted
103
+ orchestrator routes to Tangle Router by default but doesn't
104
+ duplicate its function.
105
+ - Becoming an LLM-call CDN. Caching is the consumer's job (their
106
+ agent code, their HTTP client). We don't intercept LLM calls.
107
+ - Building an "agents IDE." Substrate, not surface.
108
+
109
+ ## Migration path (post Phase B)
110
+
111
+ When Phase B passes the gate, the production orchestrator finishes:
112
+
113
+ 1. Replace in-memory store with Postgres (tenant data) + S3 (large
114
+ artifacts) OR Cloudflare D1 + R2 (Workers-native).
115
+ 2. Wire metered events to Stripe + the cost-ledger.
116
+ 3. Tenant CRUD UI + onboarding flow.
117
+ 4. Multi-tenant dashboard MVP (list runs, drill into one, diff
118
+ generations, view shipped prompt).
119
+ 5. Adversarial tenant-isolation test battery in CI.
120
+ 6. Webhooks + observability for the orchestrator itself.
121
+
122
+ Estimated effort post-Phase-B: ~1 week of focused work for one engineer.
123
+ This is fast precisely BECAUSE the wire format is locked and the
124
+ reference receiver exists — the production server is a different
125
+ implementation of the same contract.
@@ -0,0 +1,204 @@
1
+ # Hosted-ingest wire spec — `2026-05-26.v1`
2
+
3
+ The schema **every** orchestrator (ours, partners' self-hosted ones,
4
+ any future open implementation) must accept. Frozen under semver:
5
+ **new minors only add optional fields. Breaking changes mean a major
6
+ bump and a new `HostedWireVersion` literal.**
7
+
8
+ This is the contract that decouples the LAND-tier substrate
9
+ (`@tangle-network/agent-eval`) from the EXPAND-tier hosted product. A
10
+ foreign builder can:
11
+
12
+ - Use our orchestrator at `https://orchestrator.tangle.tools/v1`.
13
+ - Self-host the reference receiver from
14
+ `examples/hosted-ingest-server/`.
15
+ - Implement their own orchestrator against this spec.
16
+
17
+ All three are wire-compatible by definition.
18
+
19
+ ---
20
+
21
+ ## Transport
22
+
23
+ Two endpoints, both `POST`, both JSON. Headers on every request:
24
+
25
+ | Header | Value |
26
+ |---|---|
27
+ | `Authorization` | `Bearer <tenant-key>` (the orchestrator issues this) |
28
+ | `Content-Type` | `application/json` |
29
+ | `X-Tangle-Tenant-Id` | The tenant's stable id (the orchestrator's primary key for the tenant) |
30
+ | `X-Tangle-Wire-Version` | `2026-05-26.v1` (this spec) |
31
+ | `Idempotency-Key` (optional) | UUID; servers MUST deduplicate requests that repeat a key |
32
+
33
+ Responses are JSON of shape `{ accepted: number, rejected: Array<{ index, reason }> }`. The
34
+ server SHOULD return 202 (accepted, async) or 200 (accepted, synchronous);
35
+ both are equivalent for the wire's purposes.
36
+
37
+ ### `POST /v1/ingest/eval-runs`
38
+
39
+ Body: `IngestEvalRunsRequest = { wireVersion, events: EvalRunEvent[] }`.
40
+
41
+ Typically one ingest call per logical eval-run; when streaming, generations
42
+ arrive incrementally via repeated calls with the same `runId`. The
43
+ orchestrator deduplicates by `(tenantId, runId, generation.index)`.
44
+
45
+ ### `POST /v1/ingest/traces`
46
+
47
+ Body: `IngestTracesRequest = { wireVersion, spans: TraceSpanEvent[] }`.
48
+
49
+ Standard OTLP-shaped spans with a few additional attributes
50
+ (`tangle.runId`, `tangle.generation`, `tangle.cellId`,
51
+ `tangle.scenarioId`) so the orchestrator can pivot between the
52
+ eval-run stream and the underlying execution trace.
53
+
54
+ ---
55
+
56
+ ## `EvalRunEvent`
57
+
58
+ ```ts
59
+ interface EvalRunEvent {
60
+ runId: string // stable; same id across all generations of one run
61
+ runDir: string // logical run directory (mem://... or filesystem path)
62
+ timestamp: string // ISO-8601
63
+ status: // lifecycle stage this event represents
64
+ | 'started'
65
+ | 'baseline-complete'
66
+ | 'generation-complete'
67
+ | 'gate-decided'
68
+ | 'finished'
69
+ | 'errored'
70
+ labels: Record<string, string> // free-form (env, branch, model id, etc.)
71
+ baseline?: EvalRunGenerationSnapshot // present when status >= baseline-complete
72
+ generations: EvalRunGenerationSnapshot[]
73
+ gateDecision?: // present when status >= gate-decided
74
+ | 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling'
75
+ holdoutLift?: number // winner-on-holdout - baseline-on-holdout
76
+ totalCostUsd: number
77
+ totalDurationMs: number
78
+ errorMessage?: string // present when status === 'errored'
79
+ }
80
+ ```
81
+
82
+ ## `EvalRunGenerationSnapshot`
83
+
84
+ ```ts
85
+ interface EvalRunGenerationSnapshot {
86
+ index: number // 0 is baseline; 1..N are improvement generations
87
+ surfaceHash: string // stable hash of the candidate surface (pivot key)
88
+ surface?: MutableSurface // OMITTED to avoid PII when consumer prefers
89
+ cells: EvalRunCellScore[]
90
+ compositeMean: number
91
+ costUsd: number
92
+ durationMs: number
93
+ }
94
+ ```
95
+
96
+ ## `EvalRunCellScore`
97
+
98
+ ```ts
99
+ interface EvalRunCellScore {
100
+ scenarioId: string
101
+ rep: number // 0 for the default; > 0 when reps > 1
102
+ compositeMean: number // composite across all judges + dimensions
103
+ dimensions: Record< // outer key = judge name; inner = dimension name → score
104
+ string,
105
+ Record<string, number>
106
+ >
107
+ errorMessage?: string // present when the dispatch threw
108
+ }
109
+ ```
110
+
111
+ ## `TraceSpanEvent`
112
+
113
+ ```ts
114
+ interface TraceSpanEvent {
115
+ // Standard OTel
116
+ traceId: string
117
+ spanId: string
118
+ parentSpanId?: string
119
+ name: string
120
+ startTimeUnixNano: number
121
+ endTimeUnixNano: number
122
+ attributes: Record<string, string | number | boolean>
123
+ events?: Array<{ timeUnixNano, name, attributes? }>
124
+ status?: { code: 'OK' | 'ERROR' | 'UNSET', message? }
125
+
126
+ // Tangle additions (all optional) for pivoting
127
+ 'tangle.runId'?: string
128
+ 'tangle.generation'?: number
129
+ 'tangle.cellId'?: string
130
+ 'tangle.scenarioId'?: string
131
+ }
132
+ ```
133
+
134
+ ---
135
+
136
+ ## Server requirements
137
+
138
+ Any orchestrator implementing this spec MUST:
139
+
140
+ 1. **Validate auth**: reject without `Authorization` header (401), with a
141
+ mismatched bearer token (401), or without a recognized `X-Tangle-Tenant-Id`
142
+ (404).
143
+ 2. **Validate wire version**: reject incompatible wire versions (400 with
144
+ a clear error message). The date component is the breaking-change axis.
145
+ 3. **Validate tenant isolation**: queries with `tenantId` X never return
146
+ data tagged with `tenantId` Y. Test this adversarially.
147
+ 4. **Honor idempotency**: when an `Idempotency-Key` matches a prior
148
+ request from the same tenant in the last 24h, return the same response
149
+ without double-processing.
150
+ 5. **Persist eval-runs durably**: at least the event + cell scores must
151
+ survive an orchestrator restart. Trace spans MAY be best-effort.
152
+ 6. **Provide read access**: GET endpoints for the tenant to list + fetch
153
+ their own runs. Wire format for reads is NOT part of this spec — each
154
+ orchestrator can pick its own (REST + JSON, gRPC, GraphQL).
155
+
156
+ Servers SHOULD also:
157
+
158
+ - Provide a webhook callback per tenant for `gate-decided` events.
159
+ - Provide a billable-events emitter (Stripe meter / equivalent) per ingest
160
+ call so consumption can be metered.
161
+ - Provide a dashboard or API to view + diff per-scenario lifts over time.
162
+
163
+ ---
164
+
165
+ ## Reference implementation
166
+
167
+ `examples/hosted-ingest-server/` — a minimal hono-based receiver. ~200
168
+ LOC. Validates auth, accepts ingest, stores in memory, exposes a
169
+ read endpoint. Runs anywhere Node runs.
170
+
171
+ ```sh
172
+ TENANT_KEY=dev-token TENANT_ID=acme pnpm tsx examples/hosted-ingest-server/server.ts
173
+ ```
174
+
175
+ In another terminal:
176
+
177
+ ```sh
178
+ HOSTED_ENDPOINT=http://localhost:8080 \
179
+ HOSTED_TENANT_KEY=dev-token \
180
+ HOSTED_TENANT_ID=acme \
181
+ pnpm tsx examples/foreign-agent-quickstart/index.ts
182
+ ```
183
+
184
+ The quickstart's eval-run gets POSTed to the reference receiver; the
185
+ receiver's `GET /v1/runs` lists it back.
186
+
187
+ ---
188
+
189
+ ## Versioning
190
+
191
+ `HostedWireVersion` is `"2026-05-26.v1"`.
192
+
193
+ - Adding an optional field → no version change.
194
+ - Adding a new endpoint or new event type → minor wire bump
195
+ (`2026-05-26.v2`).
196
+ - Changing the shape of an existing field, removing a field, or
197
+ changing semantics of an existing field → major wire bump
198
+ (`2026-11-XX.v1`); a server may accept both versions during a
199
+ transition window.
200
+
201
+ Servers MUST reject requests with `X-Tangle-Wire-Version` they don't
202
+ support, with a 400 listing the versions they DO accept.
203
+
204
+ The version string IS the spec id — pin against it.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.46.0",
3
+ "version": "0.47.0",
4
4
  "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {
@@ -119,6 +119,11 @@
119
119
  "import": "./dist/adapters/http.js",
120
120
  "default": "./dist/adapters/http.js"
121
121
  },
122
+ "./hosted": {
123
+ "types": "./dist/hosted/index.d.ts",
124
+ "import": "./dist/hosted/index.js",
125
+ "default": "./dist/hosted/index.js"
126
+ },
122
127
  "./openapi.json": {
123
128
  "default": "./dist/openapi.json"
124
129
  }