npm - @tangle-network/agent-eval - Versions diffs - 0.59.1 → 0.61.0 - Mend

@tangle-network/agent-eval 0.59.1 → 0.61.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (113)

package/CHANGELOG.md +21 -0
package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/http.js +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/langchain.js +1 -1
package/dist/adapters/otel.d.ts +5 -5
package/dist/adapters/otel.js +1 -1
package/dist/agent-profile-9J9hxdm2.d.ts +114 -0
package/dist/benchmarks/index.d.ts +3 -3
package/dist/benchmarks/index.js +2 -2
package/dist/builder-eval/index.js +3 -3
package/dist/campaign/index.d.ts +153 -9
package/dist/campaign/index.js +229 -23
package/dist/campaign/index.js.map +1 -1
package/dist/{chunk-QDOSODID.js → chunk-3B7Y5AUR.js} +2 -2
package/dist/{chunk-QYJT52YW.js → chunk-3BFEG2F6.js} +1 -1
package/dist/chunk-3BFEG2F6.js.map +1 -0
package/dist/{chunk-J4DIMSRK.js → chunk-6EKXFFGQ.js} +2 -2
package/dist/{chunk-MHQPVHXU.js → chunk-6QDKWHLS.js} +2 -2
package/dist/{chunk-63EPZQUZ.js → chunk-6REHLN5J.js} +2 -2
package/dist/{chunk-GM476SZU.js → chunk-AIWHLG7J.js} +5 -5
package/dist/{chunk-AIXHUIHG.js → chunk-B26KI423.js} +3 -3
package/dist/{chunk-NCK5QLGT.js → chunk-F3SRAAZO.js} +2 -2
package/dist/{chunk-N4SBKEPJ.js → chunk-GMXHLSLL.js} +107 -2
package/dist/chunk-GMXHLSLL.js.map +1 -0
package/dist/{chunk-VXNVVBZO.js → chunk-IHDHUN2X.js} +2 -2
package/dist/{chunk-S3SDD56V.js → chunk-ITBRCT73.js} +2 -2
package/dist/{chunk-OLIBRKRD.js → chunk-KX6F6NCG.js} +2 -2
package/dist/{chunk-74Y2EMNH.js → chunk-OLULBECP.js} +18 -6
package/dist/chunk-OLULBECP.js.map +1 -0
package/dist/chunk-PQV2TKC3.js +27 -0
package/dist/chunk-PQV2TKC3.js.map +1 -0
package/dist/chunk-PZ5AY32C.js +10 -0
package/dist/{chunk-UBPIXOC4.js → chunk-SBCB6VZY.js} +2 -2
package/dist/chunk-SHTXZ4O2.js +113 -0
package/dist/chunk-SHTXZ4O2.js.map +1 -0
package/dist/{chunk-JB4UWIM6.js → chunk-SUGME4OT.js} +266 -15
package/dist/chunk-SUGME4OT.js.map +1 -0
package/dist/{chunk-YTMXBHFM.js → chunk-T375SUOZ.js} +2 -2
package/dist/{chunk-PIEAE33T.js → chunk-Z4ZCBC7M.js} +2 -2
package/dist/cli.js +4 -4
package/dist/contract/index.d.ts +48 -16
package/dist/contract/index.js +59 -19
package/dist/contract/index.js.map +1 -1
package/dist/{control-DjEgwWNo.d.ts → control-Bf8owbuG.d.ts} +2 -2
package/dist/control.d.ts +5 -5
package/dist/control.js +4 -4
package/dist/{dataset-BlwAtYYf.d.ts → dataset-B2kL-fSM.d.ts} +1 -1
package/dist/{errors-mje_cKOs.d.ts → errors-Dwqw-T_m.d.ts} +1 -1
package/dist/{feedback-trajectory-DpUmE90J.d.ts → feedback-trajectory-8hKC5EOb.d.ts} +1 -1
package/dist/governance/index.d.ts +3 -3
package/dist/governance/index.js +1 -1
package/dist/hosted/index.d.ts +5 -5
package/dist/hosted/index.js +1 -1
package/dist/{index-wlaiph9Y.d.ts → index-Bvk35ils.d.ts} +1 -1
package/dist/{index-D2nT6_KT.d.ts → index-D9dwa00f.d.ts} +2 -2
package/dist/index.d.ts +24 -132
package/dist/index.js +23 -36
package/dist/index.js.map +1 -1
package/dist/{integrity-CfXjSqEv.d.ts → integrity-CJzrpUua.d.ts} +1 -1
package/dist/knowledge/index.js +1 -1
package/dist/{llm-client-BXVRUZyX.d.ts → llm-client-DbjLfz-K.d.ts} +1 -1
package/dist/matrix/index.js +1 -1
package/dist/meta-eval/index.d.ts +3 -3
package/dist/meta-eval/index.js +1 -1
package/dist/multishot/index.js +1 -1
package/dist/openapi.json +1 -1
package/dist/pipelines/index.js +4 -4
package/dist/prm/index.js +1 -1
package/dist/{run-improvement-loop-BhfdjrMY.d.ts → provenance-D0WeCXt1.d.ts} +208 -6
package/dist/{red-team-CrC5MZYd.d.ts → red-team-DW9Ca_tj.d.ts} +1 -1
package/dist/{registry-DK9kqXvb.d.ts → registry-qmbYT3Eo.d.ts} +2 -2
package/dist/{release-report-DmPjIce3.d.ts → release-report-DszkgvJ3.d.ts} +3 -3
package/dist/reporting.d.ts +6 -6
package/dist/reporting.js +5 -5
package/dist/{researcher-JP8EvnLv.d.ts → researcher-BaVsy0sW.d.ts} +4 -4
package/dist/rl.d.ts +9 -9
package/dist/rl.js +8 -8
package/dist/{rubric-predictive-validity-B3qNa4aY.d.ts → rubric-predictive-validity-DgBHWsh7.d.ts} +1 -1
package/dist/run-campaign-HXPJAUZ3.js +10 -0
package/dist/{run-record-etiCMsUq.d.ts → run-record-DgUVo5pw.d.ts} +1 -1
package/dist/{summary-report-DLxh4yWk.d.ts → summary-report-BQvXpvaR.d.ts} +1 -1
package/dist/telemetry/file.js +1 -1
package/dist/telemetry/index.js +1 -1
package/dist/traces.d.ts +2 -2
package/dist/traces.js +4 -4
package/dist/{types-BgrxOJSf.d.ts → types-Beb6KPqZ.d.ts} +52 -4
package/dist/wire/index.d.ts +3 -3
package/dist/wire/index.js +4 -4
package/package.json +1 -1
package/dist/chunk-74Y2EMNH.js.map +0 -1
package/dist/chunk-JB4UWIM6.js.map +0 -1
package/dist/chunk-N4SBKEPJ.js.map +0 -1
package/dist/chunk-NSBPE2FW.js +0 -17
package/dist/chunk-QYJT52YW.js.map +0 -1
package/dist/chunk-ZWEQJIM6.js +0 -220
package/dist/chunk-ZWEQJIM6.js.map +0 -1
package/dist/run-campaign-ZURVWMMI.js +0 -10
/package/dist/{chunk-QDOSODID.js.map → chunk-3B7Y5AUR.js.map} +0 -0
/package/dist/{chunk-J4DIMSRK.js.map → chunk-6EKXFFGQ.js.map} +0 -0
/package/dist/{chunk-MHQPVHXU.js.map → chunk-6QDKWHLS.js.map} +0 -0
/package/dist/{chunk-63EPZQUZ.js.map → chunk-6REHLN5J.js.map} +0 -0
/package/dist/{chunk-GM476SZU.js.map → chunk-AIWHLG7J.js.map} +0 -0
/package/dist/{chunk-AIXHUIHG.js.map → chunk-B26KI423.js.map} +0 -0
/package/dist/{chunk-NCK5QLGT.js.map → chunk-F3SRAAZO.js.map} +0 -0
/package/dist/{chunk-VXNVVBZO.js.map → chunk-IHDHUN2X.js.map} +0 -0
/package/dist/{chunk-S3SDD56V.js.map → chunk-ITBRCT73.js.map} +0 -0
/package/dist/{chunk-OLIBRKRD.js.map → chunk-KX6F6NCG.js.map} +0 -0
/package/dist/{chunk-NSBPE2FW.js.map → chunk-PZ5AY32C.js.map} +0 -0
/package/dist/{chunk-UBPIXOC4.js.map → chunk-SBCB6VZY.js.map} +0 -0
/package/dist/{chunk-YTMXBHFM.js.map → chunk-T375SUOZ.js.map} +0 -0
/package/dist/{chunk-PIEAE33T.js.map → chunk-Z4ZCBC7M.js.map} +0 -0
/package/dist/{run-campaign-ZURVWMMI.js.map → run-campaign-HXPJAUZ3.js.map} +0 -0

package/dist/{integrity-CfXjSqEv.d.ts → integrity-CJzrpUua.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { C as CaptureIntegrityError } from './errors-mje_cKOs.js';
+import { C as CaptureIntegrityError } from './errors-Dwqw-T_m.js';
 import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
 import { T as TraceStore } from './store-CKUAgsJz.js';

package/dist/knowledge/index.js CHANGED Viewed

@@ -7,7 +7,7 @@ import {
 } from "../chunk-3CKU6VGU.js";
 import "../chunk-NCRFYPS3.js";
 import "../chunk-TVVP3ZZQ.js";
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
 export {
   acquisitionPlansForKnowledgeGaps,
   blockingKnowledgeEval,

package/dist/{llm-client-BXVRUZyX.d.ts → llm-client-DbjLfz-K.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-mje_cKOs.js';
+import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-Dwqw-T_m.js';
 import { R as RawProviderSink, P as ProviderRedactor } from './raw-provider-sink-C46HDghv.js';
 /**

package/dist/matrix/index.js CHANGED Viewed

@@ -3,7 +3,7 @@ import {
   runAgentMatrix,
   summariseRows
 } from "../chunk-QWV226SL.js";
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
 export {
   buildByAxis,
   runAgentMatrix,

package/dist/meta-eval/index.d.ts CHANGED Viewed

@@ -2,9 +2,9 @@ import { T as TraceStore } from '../store-CKUAgsJz.js';
 import { R as Run } from '../schema-m0gsnbt3.js';
 import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
 export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
-export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-B3qNa4aY.js';
-import '../run-record-etiCMsUq.js';
-import '../errors-mje_cKOs.js';
+export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-DgBHWsh7.js';
+import '../run-record-DgUVo5pw.js';
+import '../errors-Dwqw-T_m.js';
 /**
  * Correlation study — "does our eval score predict real-world outcomes?"

package/dist/meta-eval/index.js CHANGED Viewed

@@ -10,7 +10,7 @@ import {
   llmSpans
 } from "../chunk-47X6LRCE.js";
 import "../chunk-5BKGXME7.js";
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
 // src/meta-eval/calibration.ts
 async function calibrationCurve(traceStore, outcomeStore, evalMetric, outcomeMetric, options = {}) {

package/dist/multishot/index.js CHANGED Viewed

@@ -1,7 +1,7 @@
 import {
   runAgentMatrix
 } from "../chunk-QWV226SL.js";
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
 // src/multishot/router.ts
 async function routerCompletion(req) {

package/dist/openapi.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "openapi": "3.1.0",
   "info": {
     "title": "@tangle-network/agent-eval — wire protocol",
-    "version": "0.59.1",
+    "version": "0.61.0",
     "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
     "contact": {
       "name": "Tangle Network",

package/dist/pipelines/index.js CHANGED Viewed

@@ -3,13 +3,13 @@ import {
   classifyFailure,
   compareToBaseline,
   computeToolUseMetrics
-} from "../chunk-QDOSODID.js";
+} from "../chunk-3B7Y5AUR.js";
 import {
   buildTrajectory
 } from "../chunk-RZTMDUO7.js";
 import {
   interRaterReliability
-} from "../chunk-S3SDD56V.js";
+} from "../chunk-ITBRCT73.js";
 import {
   aggregateLlm,
   argHash,
@@ -18,8 +18,8 @@ import {
   toolSpans
 } from "../chunk-47X6LRCE.js";
 import "../chunk-5BKGXME7.js";
-import "../chunk-QYJT52YW.js";
-import "../chunk-NSBPE2FW.js";
+import "../chunk-3BFEG2F6.js";
+import "../chunk-PZ5AY32C.js";
 // src/pipelines/budget-breach.ts
 async function budgetBreachView(store, options = {}) {

package/dist/prm/index.js CHANGED Viewed

@@ -9,7 +9,7 @@ import "../chunk-5BKGXME7.js";
 import {
   TraceEmitter
 } from "../chunk-TVVP3ZZQ.js";
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
 // src/prm/builtin-rubrics.ts
 function outputLengthRubric(args = {}) {

package/dist/{run-improvement-loop-BhfdjrMY.d.ts → provenance-D0WeCXt1.d.ts} RENAMED Viewed

@@ -1,7 +1,8 @@
-import { S as Scenario, k as CampaignResult, o as GateResult, u as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, l as CampaignTraceWriter, M as MutableSurface, q as GenerationRecord } from './types-BgrxOJSf.js';
-import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
-import { R as RedTeamCase } from './red-team-CrC5MZYd.js';
-import { R as RunRecord } from './run-record-etiCMsUq.js';
+import { S as Scenario, C as CampaignResult, q as GateResult, v as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, n as CampaignTraceWriter, M as MutableSurface, s as GenerationRecord, p as GateDecision } from './types-Beb6KPqZ.js';
+import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
+import { R as RedTeamCase } from './red-team-DW9Ca_tj.js';
+import { R as RunRecord } from './run-record-DgUVo5pw.js';
+import { H as HostedClient, T as TraceSpanEvent } from './index-D9dwa00f.js';
 /**
  * @experimental
@@ -242,7 +243,11 @@ interface CampaignStorage {
 }
 /** Node-filesystem storage — the default. Lazily requires `node:fs` so the
  *  module imports cleanly in non-Node runtimes (where the caller passes
- *  `inMemoryCampaignStorage` instead and never constructs this). */
+ *  `inMemoryCampaignStorage` instead and never constructs this).
+ *
+ *  `createRequire(import.meta.url)` is the ESM-native lazy require — a bare
+ *  `require` is a ReferenceError under `"type": "module"`, which is exactly
+ *  the shape this package publishes. */
 declare function fsCampaignStorage(): CampaignStorage;
 /** In-memory storage for filesystem-less runtimes. Artifacts + trace spans
  *  live in a `Map` for the duration of the run; the `CampaignResult` is
@@ -385,6 +390,14 @@ interface RunOptimizationResult<TArtifact, TScenario extends Scenario> {
     }>;
     winnerSurface: MutableSurface;
     winnerSurfaceHash: string;
+    /** Driver label for the promoted surface. Present when the winning
+     *  candidate came from a `ProposedCandidate` (a reflective driver);
+     *  absent when the winner is the baseline or a bare-surface mutator. */
+    winnerLabel?: string;
+    /** Driver rationale for the promoted surface — the "because Z" that
+     *  motivated the winning change. Survives to `SelfImproveResult` and the
+     *  emitted provenance record. Absent when the winner is the baseline. */
+    winnerRationale?: string;
     baselineCampaign: CampaignResult<TArtifact, TScenario>;
 }
 declare function runOptimization<TScenario extends Scenario, TArtifact>(opts: RunOptimizationOptions<TScenario, TArtifact>): Promise<RunOptimizationResult<TArtifact, TScenario>>;
@@ -443,8 +456,197 @@ interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extend
     baselineOnHoldout: CampaignResult<TArtifact, TScenario>;
     winnerOnHoldout: CampaignResult<TArtifact, TScenario>;
     gateResult: Awaited<ReturnType<Gate<TArtifact, TScenario>['decide']>>;
+    /** Unified baseline→winner surface diff. Computed UNCONDITIONALLY (not only
+     *  when `autoOnPromote === 'pr'`) so the diff that the gate decided on is
+     *  always present on the result + in the emitted provenance record. Empty
+     *  string when winner == baseline (no change to diff). */
+    promotedDiff: string;
     prResult?: ReturnType<typeof openAutoPr>;
 }
 declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>;
+declare function defaultRenderDiff(winnerSurface: MutableSurface, baselineSurface: MutableSurface): string;
+/**
+ * @experimental
+ *
+ * Loop provenance — the durable, queryable record of WHAT a self-improvement
+ * loop did and WHY, plus the OTel spans that let an OTLP collector pivot from
+ * an eval-run to the underlying candidate→cell→gate→promote chain.
+ *
+ * Two artifacts, one source of truth:
+ *
+ *   1. `LoopProvenanceRecord` — a structured JSON record capturing every
+ *      candidate (surfaceHash + label + rationale), its measured composite,
+ *      the gate decision + reasons + delta, the held-out lift, the explicit
+ *      baseline→candidate diff, and BACKEND PROVENANCE (the
+ *      `assertRealBackend` verdict + worker call count + model). This is the
+ *      ingestable audit artifact: the +lift recomputes from it, the "because
+ *      Z" rationale survives in it, and a stub backend is detectable from it.
+ *
+ *   2. `loopProvenanceSpans()` — the same chain emitted as OTLP-ingestable
+ *      `TraceSpanEvent`s, pivoted on the substrate's standard
+ *      `tangle.runId` / `tangle.scenarioId` / `tangle.cellId` /
+ *      `tangle.generation` attributes (the same pivots `/adapters/otel`
+ *      reads). The hosted `/v1/ingest/traces` endpoint receives the FULL loop,
+ *      not just the `cost.*` spans `runCampaign` already emits per cell.
+ *
+ * The record is built from the substrate's own loop result + the per-call
+ * `RunRecord`s the worker emitted — no new measurement, no recomputation that
+ * could drift from what the gate actually saw.
+ */
+/** Stable sha256 (full hex) of a surface's effective text. Code surfaces hash
+ *  their worktree+base identity since the content lives in git. Distinct from
+ *  `surfaceHash` (16-char content fingerprint used as a loop identity key);
+ *  this is the byte-identical-verifiable content hash the provenance record +
+ *  `RunRecord.promptHash` carry. */
+declare function surfaceContentHash(surface: MutableSurface): string;
+interface LoopProvenanceCandidate {
+    /** Generation index this candidate was proposed in. */
+    generation: number;
+    /** 16-char loop-identity fingerprint (matches `GenerationCandidate.surfaceHash`). */
+    surfaceHash: string;
+    /** Full sha256 content hash — byte-identical-verifiable. */
+    contentHash: string;
+    /** Driver label, when the driver returned a `ProposedCandidate`. */
+    label?: string;
+    /** Driver rationale — the "because Z". When the driver returned a bare
+     *  surface (blind mutator) this is absent. */
+    rationale?: string;
+    /** Mean composite this candidate scored on the search split. */
+    composite: number;
+    /** Whether this candidate was promoted out of its generation. */
+    promoted: boolean;
+}
+interface LoopProvenanceBackend {
+    /** `assertRealBackend`-grade verdict over the worker call records. */
+    verdict: 'real' | 'mixed' | 'stub';
+    /** Number of worker LLM calls captured (the audit's "worker call count"). */
+    workerCallCount: number;
+    /** Distinct model ids observed across worker calls. */
+    models: string[];
+    totalInputTokens: number;
+    totalOutputTokens: number;
+    totalCostUsd: number;
+}
+/**
+ * The durable provenance record. Aligns to the hosted `EvalRunEvent` path but
+ * ADDS the rationale + the explicit baseline→candidate diff (both omitted from
+ * the bare hosted event) + backend provenance.
+ */
+interface LoopProvenanceRecord {
+    schema: 'tangle.loop-provenance.v1';
+    runId: string;
+    runDir: string;
+    timestamp: string;
+    /** Baseline + winner surface content hashes — distinguishable, byte-verifiable. */
+    baselineContentHash: string;
+    winnerContentHash: string;
+    /** Driver label/rationale for the promoted change. Absent ⇒ winner == baseline. */
+    winnerLabel?: string;
+    winnerRationale?: string;
+    /** The explicit baseline→winner unified diff the gate decided on. */
+    diff: string;
+    /** Every candidate across every generation, each carrying its rationale. */
+    candidates: LoopProvenanceCandidate[];
+    /** The gate verdict — decision + reasons + contributing gates + delta. */
+    gate: {
+        decision: GateDecision;
+        reasons: string[];
+        delta?: number;
+        contributingGates: Array<{
+            name: string;
+            passed: boolean;
+        }>;
+    };
+    /** baseline-on-holdout composite mean. */
+    baselineHoldoutComposite: number;
+    /** winner-on-holdout composite mean. */
+    winnerHoldoutComposite: number;
+    /** winnerHoldout - baselineHoldout — RECOMPUTABLE from this record. */
+    heldOutLift: number;
+    /** Backend provenance: stub-vs-real verdict + worker call count + models. */
+    backend: LoopProvenanceBackend;
+    totalCostUsd: number;
+    totalDurationMs: number;
+}
+interface BuildLoopProvenanceArgs<TArtifact, TScenario extends Scenario> {
+    runId: string;
+    runDir: string;
+    timestamp: string;
+    baselineSurface: MutableSurface;
+    winnerSurface: MutableSurface;
+    winnerLabel?: string;
+    winnerRationale?: string;
+    diff: string;
+    /** Per-generation candidate records straight off the loop result. */
+    generations: Array<{
+        generationIndex: number;
+        candidates: Array<{
+            surfaceHash: string;
+            composite: number;
+            label?: string;
+            rationale?: string;
+        }>;
+        promoted: string[];
+        /** Surfaces measured this generation, keyed positionally to candidates so
+         *  the content hash can be computed from the real surface text. */
+        surfaces: Array<{
+            surfaceHash: string;
+            surface: MutableSurface;
+        }>;
+    }>;
+    gate: GateResult;
+    baselineOnHoldout: CampaignResult<TArtifact, TScenario>;
+    winnerOnHoldout: CampaignResult<TArtifact, TScenario>;
+    /** Worker call records — the source for backend provenance. */
+    workerRecords: ReadonlyArray<RunRecord>;
+    totalCostUsd: number;
+    totalDurationMs: number;
+}
+/** Build the durable provenance record from a completed loop result. */
+declare function buildLoopProvenanceRecord<TArtifact, TScenario extends Scenario>(args: BuildLoopProvenanceArgs<TArtifact, TScenario>): LoopProvenanceRecord;
+/**
+ * Build the loop's OTLP-ingestable spans from a provenance record. One root
+ * span per loop (`tangle.runId`), one span per generation, one span per
+ * candidate (carrying its surfaceHash + label), and one span for the gate
+ * decision (carrying reasons + delta + lift). Candidate + gate spans pivot on
+ * the same `tangle.runId` / `tangle.generation` attributes `/adapters/otel`
+ * reads, so the hosted collector reconstructs the full tree.
+ *
+ * Times are synthesized monotonically off a single base so the span tree is
+ * orderable; the substrate does not retain per-candidate wall-clock starts.
+ */
+declare function loopProvenanceSpans(record: LoopProvenanceRecord, opts?: {
+    baseTimeMs?: number;
+}): TraceSpanEvent[];
+/** Canonical durable paths under the run dir. */
+declare function provenanceRecordPath(runDir: string): string;
+declare function provenanceSpansPath(runDir: string): string;
+interface EmitLoopProvenanceResult {
+    record: LoopProvenanceRecord;
+    spans: TraceSpanEvent[];
+    /** Absolute paths the record + spans were written to, when storage persists. */
+    recordPath: string;
+    spansPath: string;
+}
+interface EmitLoopProvenanceArgs<TArtifact, TScenario extends Scenario> extends BuildLoopProvenanceArgs<TArtifact, TScenario> {
+    /** Storage the record + spans are written through. */
+    storage: CampaignStorage;
+    /** When set, the spans are also shipped to the hosted `/v1/ingest/traces`
+     *  endpoint so the collector receives the full loop, not just `cost.*`. */
+    hostedClient?: HostedClient;
+}
+/**
+ * Build the provenance record + OTel spans and persist them durably under the
+ * run dir (and ship spans to a hosted collector when one is wired). Returns
+ * both artifacts so the caller can assert on / re-derive from them.
+ *
+ * Fail-loud: the durable write throws on storage failure (a swallowed write is
+ * exactly the "emitted but lost" failure this closes). The hosted span ship is
+ * the one best-effort leg — its failure is logged, not thrown, so an offline
+ * collector never fails the loop (the durable artifact is the source of truth).
+ */
+declare function emitLoopProvenance<TArtifact, TScenario extends Scenario>(args: EmitLoopProvenanceArgs<TArtifact, TScenario>): Promise<EmitLoopProvenanceResult>;
-export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverConstraints as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type GepaDriverOptions as a, type OpenAutoPrResult as b, type RunEvalOptions as c, type RunImprovementLoopOptions as d, type RunImprovementLoopResult as e, type RunOptimizationOptions as f, type RunOptimizationResult as g, composeGate as h, countSentenceEdits as i, defaultProductionGate as j, evolutionaryDriver as k, extractH2Sections as l, fsCampaignStorage as m, gepaDriver as n, heldOutGate as o, inMemoryCampaignStorage as p, openAutoPr as q, runCampaign as r, runEval as s, runImprovementLoop as t, runOptimization as u, surfaceHash as v };
+export { provenanceSpansPath as A, type BuildLoopProvenanceArgs as B, type CampaignStorage as C, type DefaultProductionGateOptions as D, type EmitLoopProvenanceArgs as E, runCampaign as F, type GepaDriverConstraints as G, type HeldOutGateOptions as H, runEval as I, runImprovementLoop as J, runOptimization as K, type LoopProvenanceBackend as L, surfaceContentHash as M, surfaceHash as N, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type EmitLoopProvenanceResult as a, type EvolutionaryDriverOptions as b, type GepaDriverOptions as c, type LoopProvenanceCandidate as d, type LoopProvenanceRecord as e, type OpenAutoPrResult as f, type RunEvalOptions as g, type RunImprovementLoopOptions as h, type RunImprovementLoopResult as i, type RunOptimizationOptions as j, type RunOptimizationResult as k, buildLoopProvenanceRecord as l, composeGate as m, countSentenceEdits as n, defaultProductionGate as o, defaultRenderDiff as p, emitLoopProvenance as q, evolutionaryDriver as r, extractH2Sections as s, fsCampaignStorage as t, gepaDriver as u, heldOutGate as v, inMemoryCampaignStorage as w, loopProvenanceSpans as x, openAutoPr as y, provenanceRecordPath as z };

package/dist/{red-team-CrC5MZYd.d.ts → red-team-DW9Ca_tj.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { a as DatasetScenario, b as Dataset } from './dataset-BlwAtYYf.js';
+import { a as DatasetScenario, b as Dataset } from './dataset-B2kL-fSM.js';
 import { T as TraceStore } from './store-CKUAgsJz.js';
 /**

package/dist/{registry-DK9kqXvb.d.ts → registry-qmbYT3Eo.d.ts} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-BXVRUZyX.js';
-import { R as RunRecord } from './run-record-etiCMsUq.js';
+import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-DbjLfz-K.js';
+import { R as RunRecord } from './run-record-DgUVo5pw.js';
 import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
 import { J as JudgeInput } from './types-DhqpAi_z.js';

package/dist/{release-report-DmPjIce3.d.ts → release-report-DszkgvJ3.d.ts} RENAMED Viewed

@@ -1,8 +1,8 @@
 import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
 import { a as JudgeScore } from './types-DhqpAi_z.js';
-import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-BlwAtYYf.js';
-import { m as GateDecision } from './summary-report-DLxh4yWk.js';
-import { R as RunRecord, a as RunSplitTag } from './run-record-etiCMsUq.js';
+import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-B2kL-fSM.js';
+import { m as GateDecision } from './summary-report-BQvXpvaR.js';
+import { R as RunRecord, a as RunSplitTag } from './run-record-DgUVo5pw.js';
 /**
  * Release confidence gate.

package/dist/reporting.d.ts CHANGED Viewed

@@ -1,14 +1,14 @@
-export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-B3qNa4aY.js';
-export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DmPjIce3.js';
+export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-DgBHWsh7.js';
+export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DszkgvJ3.js';
 export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
-export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-DLxh4yWk.js';
-import './run-record-etiCMsUq.js';
-import './errors-mje_cKOs.js';
+export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-BQvXpvaR.js';
+import './run-record-DgUVo5pw.js';
+import './errors-Dwqw-T_m.js';
 import './schema-m0gsnbt3.js';
 import './outcome-store-D6KWmYvj.js';
 import './judge-calibration-DilmB3Ml.js';
 import './types-DhqpAi_z.js';
 import '@tangle-network/tcloud';
-import './dataset-BlwAtYYf.js';
+import './dataset-B2kL-fSM.js';
 import './failure-cluster-CL7IVgkJ.js';
 import './store-CKUAgsJz.js';

package/dist/reporting.js CHANGED Viewed

@@ -4,7 +4,7 @@ import {
   evaluateReleaseConfidence,
   judgeReplayGate,
   renderReleaseReport
-} from "./chunk-AIXHUIHG.js";
+} from "./chunk-B26KI423.js";
 import {
   rubricPredictiveValidity
 } from "./chunk-YRZ4M5GS.js";
@@ -18,15 +18,15 @@ import {
   paretoChart,
   researchReport,
   summaryTable
-} from "./chunk-OLIBRKRD.js";
+} from "./chunk-KX6F6NCG.js";
 import {
   benjaminiHochberg,
   pairedBootstrap,
   wilcoxonSignedRank
-} from "./chunk-S3SDD56V.js";
+} from "./chunk-ITBRCT73.js";
 import "./chunk-VSMTAMNK.js";
-import "./chunk-QYJT52YW.js";
-import "./chunk-NSBPE2FW.js";
+import "./chunk-3BFEG2F6.js";
+import "./chunk-PZ5AY32C.js";
 export {
   RESEARCH_REPORT_HARD_PAIR_FLOOR,
   assertReleaseConfidence,

package/dist/{researcher-JP8EvnLv.d.ts → researcher-BaVsy0sW.d.ts} RENAMED Viewed

@@ -1,8 +1,8 @@
-import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-etiCMsUq.js';
-import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-BXVRUZyX.js';
-import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-DLxh4yWk.js';
+import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-DgUVo5pw.js';
+import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-DbjLfz-K.js';
+import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-BQvXpvaR.js';
 import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DEZwY14K.js';
-import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CfXjSqEv.js';
+import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CJzrpUua.js';
 import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
 import { F as FailureClass } from './schema-m0gsnbt3.js';
 import { T as TraceStore } from './store-CKUAgsJz.js';

package/dist/rl.d.ts CHANGED Viewed

@@ -1,20 +1,20 @@
-import { R as RunRecord, a as RunSplitTag } from './run-record-etiCMsUq.js';
-import { k as CampaignResult } from './types-BgrxOJSf.js';
-import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-JP8EvnLv.js';
-export { r as runEvalCampaign } from './researcher-JP8EvnLv.js';
+import { R as RunRecord, a as RunSplitTag } from './run-record-DgUVo5pw.js';
+import { C as CampaignResult } from './types-Beb6KPqZ.js';
+import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-BaVsy0sW.js';
+export { r as runEvalCampaign } from './researcher-BaVsy0sW.js';
 import { S as Span } from './schema-m0gsnbt3.js';
 import { T as TraceStore } from './store-CKUAgsJz.js';
 import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
 export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-D6KWmYvj.js';
-import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-B3qNa4aY.js';
+import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-DgBHWsh7.js';
 import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
-import './errors-mje_cKOs.js';
-import './llm-client-BXVRUZyX.js';
+import './errors-Dwqw-T_m.js';
+import './llm-client-DbjLfz-K.js';
 import './raw-provider-sink-C46HDghv.js';
-import './summary-report-DLxh4yWk.js';
+import './summary-report-BQvXpvaR.js';
 import './failure-cluster-CL7IVgkJ.js';
 import './emitter-DEZwY14K.js';
-import './integrity-CfXjSqEv.js';
+import './integrity-CJzrpUua.js';
 /**
  * Test-time compute scaling curves.

package/dist/rl.js CHANGED Viewed

@@ -10,28 +10,28 @@ import {
 } from "./chunk-3RF76KTD.js";
 import {
   runEvalCampaign
-} from "./chunk-GM476SZU.js";
-import "./chunk-NCK5QLGT.js";
+} from "./chunk-AIWHLG7J.js";
+import "./chunk-F3SRAAZO.js";
 import {
   rubricPredictiveValidity
 } from "./chunk-YRZ4M5GS.js";
 import {
   evaluateInterimReleaseConfidence
 } from "./chunk-MAZ26DC7.js";
-import "./chunk-OLIBRKRD.js";
+import "./chunk-KX6F6NCG.js";
 import {
   benjaminiHochberg,
   wilcoxonSignedRank
-} from "./chunk-S3SDD56V.js";
-import "./chunk-UBPIXOC4.js";
+} from "./chunk-ITBRCT73.js";
+import "./chunk-SBCB6VZY.js";
 import "./chunk-TVVP3ZZQ.js";
 import "./chunk-VSMTAMNK.js";
-import "./chunk-VXNVVBZO.js";
+import "./chunk-IHDHUN2X.js";
 import "./chunk-PC4UYEBM.js";
 import {
   ValidationError
-} from "./chunk-QYJT52YW.js";
-import "./chunk-NSBPE2FW.js";
+} from "./chunk-3BFEG2F6.js";
+import "./chunk-PZ5AY32C.js";
 // src/rl/compute-curves.ts
 async function runComputeCurve(opts) {

package/dist/{rubric-predictive-validity-B3qNa4aY.d.ts → rubric-predictive-validity-DgBHWsh7.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { R as RunRecord } from './run-record-etiCMsUq.js';
+import { R as RunRecord } from './run-record-DgUVo5pw.js';
 import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
 /**

package/dist/run-campaign-HXPJAUZ3.js ADDED Viewed

@@ -0,0 +1,10 @@
+import {
+  runCampaign
+} from "./chunk-OLULBECP.js";
+import "./chunk-ITBRCT73.js";
+import "./chunk-3BFEG2F6.js";
+import "./chunk-PZ5AY32C.js";
+export {
+  runCampaign
+};
+//# sourceMappingURL=run-campaign-HXPJAUZ3.js.map

package/dist/{run-record-etiCMsUq.d.ts → run-record-DgUVo5pw.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { V as ValidationError } from './errors-mje_cKOs.js';
+import { V as ValidationError } from './errors-Dwqw-T_m.js';
 import { F as FailureClass } from './schema-m0gsnbt3.js';
 type AgentProfileCellSchemaVersion = 'agent-profile-cell/v1';

package/dist/{summary-report-DLxh4yWk.d.ts → summary-report-BQvXpvaR.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { R as RunRecord } from './run-record-etiCMsUq.js';
+import { R as RunRecord } from './run-record-DgUVo5pw.js';
 import { F as FailureClusterReport } from './failure-cluster-CL7IVgkJ.js';
 /**

package/dist/telemetry/file.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
 // src/telemetry/sink-file.ts
 import * as fs from "fs";

package/dist/telemetry/index.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
 // src/telemetry/schema.ts
 var TELEMETRY_SCHEMA_VERSION = 1;

package/dist/traces.d.ts CHANGED Viewed

@@ -1,9 +1,9 @@
-import { N as NotFoundError, R as ReplayError } from './errors-mje_cKOs.js';
+import { N as NotFoundError, R as ReplayError } from './errors-Dwqw-T_m.js';
 import { P as ProviderRedactor, R as RawProviderSink, d as RawProviderEvent } from './raw-provider-sink-C46HDghv.js';
 export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, c as RawProviderDirection, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';
 import { R as RunCompleteHook, a as RunCompleteHookContext } from './emitter-DEZwY14K.js';
 export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DEZwY14K.js';
-export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CfXjSqEv.js';
+export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CJzrpUua.js';
 import { T as TraceStore } from './store-CKUAgsJz.js';
 export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreOptions, I as InMemoryTraceStore, R as RunFilter, S as SpanFilter } from './store-CKUAgsJz.js';
 export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-CqTxMwDw.js';

package/dist/traces.js CHANGED Viewed

@@ -34,7 +34,7 @@ import {
   tokenizeDomainWords,
   traceAnalystFunctionGroup,
   traceAnalystOnRunComplete
-} from "./chunk-PIEAE33T.js";
+} from "./chunk-Z4ZCBC7M.js";
 import {
   DEFAULT_REDACTION_RULES,
   REDACTION_VERSION,
@@ -64,7 +64,7 @@ import {
   RunIntegrityError,
   assertRunCaptured,
   throwIfRunIncomplete
-} from "./chunk-UBPIXOC4.js";
+} from "./chunk-SBCB6VZY.js";
 import {
   TraceEmitter,
   llmSpanFromProvider
@@ -77,8 +77,8 @@ import {
   defaultProviderRedactor,
   providerFromBaseUrl
 } from "./chunk-PC4UYEBM.js";
-import "./chunk-QYJT52YW.js";
-import "./chunk-NSBPE2FW.js";
+import "./chunk-3BFEG2F6.js";
+import "./chunk-PZ5AY32C.js";
 export {
   DEFAULT_REDACTION_RULES,
   DEFAULT_TRACE_ANALYST_BUDGETS,