npm - @tangle-network/agent-eval - Versions diffs - 0.59.1 → 0.60.0 - Mend

@tangle-network/agent-eval 0.59.1 → 0.60.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57)

package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/http.js +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/langchain.js +1 -1
package/dist/adapters/otel.d.ts +2 -2
package/dist/adapters/otel.js +1 -1
package/dist/benchmarks/index.js +2 -2
package/dist/builder-eval/index.js +1 -1
package/dist/campaign/index.d.ts +7 -3
package/dist/campaign/index.js +21 -16
package/dist/campaign/index.js.map +1 -1
package/dist/{chunk-MHQPVHXU.js → chunk-6QDKWHLS.js} +2 -2
package/dist/{chunk-N4SBKEPJ.js → chunk-GBHRUAOF.js} +106 -1
package/dist/chunk-GBHRUAOF.js.map +1 -0
package/dist/{chunk-JB4UWIM6.js → chunk-LBSXXH56.js} +265 -14
package/dist/chunk-LBSXXH56.js.map +1 -0
package/dist/{chunk-74Y2EMNH.js → chunk-NOPYCRNG.js} +6 -5
package/dist/{chunk-74Y2EMNH.js.map → chunk-NOPYCRNG.js.map} +1 -1
package/dist/chunk-PZ5AY32C.js +10 -0
package/dist/chunk-SHTXZ4O2.js +113 -0
package/dist/chunk-SHTXZ4O2.js.map +1 -0
package/dist/cli.js +1 -1
package/dist/contract/index.d.ts +42 -10
package/dist/contract/index.js +55 -15
package/dist/contract/index.js.map +1 -1
package/dist/control.js +1 -1
package/dist/governance/index.js +1 -1
package/dist/hosted/index.d.ts +2 -2
package/dist/hosted/index.js +1 -1
package/dist/{index-D2nT6_KT.d.ts → index-BIkvdkSU.d.ts} +1 -1
package/dist/index.js +8 -8
package/dist/knowledge/index.js +1 -1
package/dist/matrix/index.js +1 -1
package/dist/meta-eval/index.js +1 -1
package/dist/multishot/index.js +1 -1
package/dist/openapi.json +1 -1
package/dist/pipelines/index.js +1 -1
package/dist/prm/index.js +1 -1
package/dist/{run-improvement-loop-BhfdjrMY.d.ts → provenance-BM8vmMBa.d.ts} +205 -3
package/dist/reporting.js +1 -1
package/dist/rl.d.ts +1 -1
package/dist/rl.js +1 -1
package/dist/{run-campaign-ZURVWMMI.js → run-campaign-5XENUKRF.js} +3 -3
package/dist/telemetry/file.js +1 -1
package/dist/telemetry/index.js +1 -1
package/dist/traces.js +1 -1
package/dist/{types-BgrxOJSf.d.ts → types-VCIXx_yo.d.ts} +32 -4
package/dist/wire/index.js +1 -1
package/package.json +25 -12
package/dist/chunk-JB4UWIM6.js.map +0 -1
package/dist/chunk-N4SBKEPJ.js.map +0 -1
package/dist/chunk-NSBPE2FW.js +0 -17
package/dist/chunk-ZWEQJIM6.js +0 -220
package/dist/chunk-ZWEQJIM6.js.map +0 -1
package/dist/{chunk-MHQPVHXU.js.map → chunk-6QDKWHLS.js.map} +0 -0
package/dist/{chunk-NSBPE2FW.js.map → chunk-PZ5AY32C.js.map} +0 -0
package/dist/{run-campaign-ZURVWMMI.js.map → run-campaign-5XENUKRF.js.map} +0 -0

package/dist/control.js CHANGED Viewed

@@ -17,7 +17,7 @@ import "./chunk-NCK5QLGT.js";
 import "./chunk-TVVP3ZZQ.js";
 import "./chunk-VSMTAMNK.js";
 import "./chunk-QYJT52YW.js";
-import "./chunk-NSBPE2FW.js";
+import "./chunk-PZ5AY32C.js";
 export {
   allCriticalPassed,
   controlRunToRunRecord,

package/dist/governance/index.js CHANGED Viewed

@@ -6,7 +6,7 @@ import {
   soc2Report,
   summarize
 } from "../chunk-KKHDIONI.js";
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
 export {
   classifyEuAiRisk,
   euAiActReport,

package/dist/hosted/index.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
-export { E as EvalRunCellScore, d as EvalRunEvent, e as EvalRunGenerationSnapshot, f as EvalRunStatus, g as HOSTED_WIRE_VERSION, H as HostedClient, h as HostedIngestHeaders, a as HostedTenant, i as HostedWireVersion, j as IngestEvalRunsRequest, k as IngestResponse, l as IngestTracesRequest, T as TraceSpanEvent, m as createHostedClient } from '../index-D2nT6_KT.js';
-import '../types-BgrxOJSf.js';
+export { E as EvalRunCellScore, d as EvalRunEvent, e as EvalRunGenerationSnapshot, f as EvalRunStatus, g as HOSTED_WIRE_VERSION, H as HostedClient, h as HostedIngestHeaders, a as HostedTenant, i as HostedWireVersion, j as IngestEvalRunsRequest, k as IngestResponse, l as IngestTracesRequest, T as TraceSpanEvent, m as createHostedClient } from '../index-BIkvdkSU.js';
+import '../types-VCIXx_yo.js';
 import '../summary-report-DLxh4yWk.js';
 import '../run-record-etiCMsUq.js';
 import '../errors-mje_cKOs.js';

package/dist/hosted/index.js CHANGED Viewed

@@ -2,7 +2,7 @@ import {
   HOSTED_WIRE_VERSION,
   createHostedClient
 } from "../chunk-FQK2CCIM.js";
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
 export {
   HOSTED_WIRE_VERSION,
   createHostedClient

package/dist/{index-D2nT6_KT.d.ts → index-BIkvdkSU.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { M as MutableSurface, n as GateDecision } from './types-BgrxOJSf.js';
+import { M as MutableSurface, n as GateDecision } from './types-VCIXx_yo.js';
 import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-DLxh4yWk.js';
 import { a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';

package/dist/index.js CHANGED Viewed

@@ -1,18 +1,17 @@
 import {
-  BackendIntegrityError,
   HoldoutAuditor,
-  assertRealBackend,
   canaryLeakView,
   checkBehavioralCanary,
   checkCanaries,
-  runBehavioralCanaries,
-  summarizeBackendIntegrity
-} from "./chunk-ZWEQJIM6.js";
+  runBehavioralCanaries
+} from "./chunk-SHTXZ4O2.js";
 import {
+  BackendIntegrityError,
   DEFAULT_MUTATION_PRIMITIVES,
   DEFAULT_RED_TEAM_CORPUS,
   Dataset,
   HoldoutLockedError,
+  assertRealBackend,
   buildReflectionPrompt,
   hashScenarios,
   parseReflectionResponse,
@@ -20,13 +19,14 @@ import {
   redTeamReport,
   runCanaries,
   scoreRedTeamOutput,
+  summarizeBackendIntegrity,
   toolNamesForRun
-} from "./chunk-N4SBKEPJ.js";
+} from "./chunk-GBHRUAOF.js";
 import {
   BENCHMARK_SPLIT_SEED,
   benchmarks_exports,
   deterministicSplit
-} from "./chunk-MHQPVHXU.js";
+} from "./chunk-6QDKWHLS.js";
 import {
   DEFAULT_RULES,
   classifyFailure,
@@ -260,7 +260,7 @@ import {
   ValidationError,
   VerificationError
 } from "./chunk-QYJT52YW.js";
-import "./chunk-NSBPE2FW.js";
+import "./chunk-PZ5AY32C.js";
 // src/run-score.ts
 var DEFAULT_RUN_SCORE_WEIGHTS = {

package/dist/knowledge/index.js CHANGED Viewed

@@ -7,7 +7,7 @@ import {
 } from "../chunk-3CKU6VGU.js";
 import "../chunk-NCRFYPS3.js";
 import "../chunk-TVVP3ZZQ.js";
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
 export {
   acquisitionPlansForKnowledgeGaps,
   blockingKnowledgeEval,

package/dist/matrix/index.js CHANGED Viewed

@@ -3,7 +3,7 @@ import {
   runAgentMatrix,
   summariseRows
 } from "../chunk-QWV226SL.js";
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
 export {
   buildByAxis,
   runAgentMatrix,

package/dist/meta-eval/index.js CHANGED Viewed

@@ -10,7 +10,7 @@ import {
   llmSpans
 } from "../chunk-47X6LRCE.js";
 import "../chunk-5BKGXME7.js";
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
 // src/meta-eval/calibration.ts
 async function calibrationCurve(traceStore, outcomeStore, evalMetric, outcomeMetric, options = {}) {

package/dist/multishot/index.js CHANGED Viewed

@@ -1,7 +1,7 @@
 import {
   runAgentMatrix
 } from "../chunk-QWV226SL.js";
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
 // src/multishot/router.ts
 async function routerCompletion(req) {

package/dist/openapi.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "openapi": "3.1.0",
   "info": {
     "title": "@tangle-network/agent-eval — wire protocol",
-    "version": "0.59.1",
+    "version": "0.60.0",
     "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
     "contact": {
       "name": "Tangle Network",

package/dist/pipelines/index.js CHANGED Viewed

@@ -19,7 +19,7 @@ import {
 } from "../chunk-47X6LRCE.js";
 import "../chunk-5BKGXME7.js";
 import "../chunk-QYJT52YW.js";
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
 // src/pipelines/budget-breach.ts
 async function budgetBreachView(store, options = {}) {

package/dist/prm/index.js CHANGED Viewed

@@ -9,7 +9,7 @@ import "../chunk-5BKGXME7.js";
 import {
   TraceEmitter
 } from "../chunk-TVVP3ZZQ.js";
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
 // src/prm/builtin-rubrics.ts
 function outputLengthRubric(args = {}) {

package/dist/{run-improvement-loop-BhfdjrMY.d.ts → provenance-BM8vmMBa.d.ts} RENAMED Viewed

@@ -1,7 +1,8 @@
-import { S as Scenario, k as CampaignResult, o as GateResult, u as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, l as CampaignTraceWriter, M as MutableSurface, q as GenerationRecord } from './types-BgrxOJSf.js';
+import { S as Scenario, k as CampaignResult, o as GateResult, u as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, l as CampaignTraceWriter, M as MutableSurface, q as GenerationRecord, n as GateDecision } from './types-VCIXx_yo.js';
 import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
 import { R as RedTeamCase } from './red-team-CrC5MZYd.js';
 import { R as RunRecord } from './run-record-etiCMsUq.js';
+import { H as HostedClient, T as TraceSpanEvent } from './index-BIkvdkSU.js';
 /**
  * @experimental
@@ -242,7 +243,11 @@ interface CampaignStorage {
 }
 /** Node-filesystem storage — the default. Lazily requires `node:fs` so the
  *  module imports cleanly in non-Node runtimes (where the caller passes
- *  `inMemoryCampaignStorage` instead and never constructs this). */
+ *  `inMemoryCampaignStorage` instead and never constructs this).
+ *
+ *  `createRequire(import.meta.url)` is the ESM-native lazy require — a bare
+ *  `require` is a ReferenceError under `"type": "module"`, which is exactly
+ *  the shape this package publishes. */
 declare function fsCampaignStorage(): CampaignStorage;
 /** In-memory storage for filesystem-less runtimes. Artifacts + trace spans
  *  live in a `Map` for the duration of the run; the `CampaignResult` is
@@ -385,6 +390,14 @@ interface RunOptimizationResult<TArtifact, TScenario extends Scenario> {
     }>;
     winnerSurface: MutableSurface;
     winnerSurfaceHash: string;
+    /** Driver label for the promoted surface. Present when the winning
+     *  candidate came from a `ProposedCandidate` (a reflective driver);
+     *  absent when the winner is the baseline or a bare-surface mutator. */
+    winnerLabel?: string;
+    /** Driver rationale for the promoted surface — the "because Z" that
+     *  motivated the winning change. Survives to `SelfImproveResult` and the
+     *  emitted provenance record. Absent when the winner is the baseline. */
+    winnerRationale?: string;
     baselineCampaign: CampaignResult<TArtifact, TScenario>;
 }
 declare function runOptimization<TScenario extends Scenario, TArtifact>(opts: RunOptimizationOptions<TScenario, TArtifact>): Promise<RunOptimizationResult<TArtifact, TScenario>>;
@@ -443,8 +456,197 @@ interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extend
     baselineOnHoldout: CampaignResult<TArtifact, TScenario>;
     winnerOnHoldout: CampaignResult<TArtifact, TScenario>;
     gateResult: Awaited<ReturnType<Gate<TArtifact, TScenario>['decide']>>;
+    /** Unified baseline→winner surface diff. Computed UNCONDITIONALLY (not only
+     *  when `autoOnPromote === 'pr'`) so the diff that the gate decided on is
+     *  always present on the result + in the emitted provenance record. Empty
+     *  string when winner == baseline (no change to diff). */
+    promotedDiff: string;
     prResult?: ReturnType<typeof openAutoPr>;
 }
 declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>;
+declare function defaultRenderDiff(winnerSurface: MutableSurface, baselineSurface: MutableSurface): string;
+/**
+ * @experimental
+ *
+ * Loop provenance — the durable, queryable record of WHAT a self-improvement
+ * loop did and WHY, plus the OTel spans that let an OTLP collector pivot from
+ * an eval-run to the underlying candidate→cell→gate→promote chain.
+ *
+ * Two artifacts, one source of truth:
+ *
+ *   1. `LoopProvenanceRecord` — a structured JSON record capturing every
+ *      candidate (surfaceHash + label + rationale), its measured composite,
+ *      the gate decision + reasons + delta, the held-out lift, the explicit
+ *      baseline→candidate diff, and BACKEND PROVENANCE (the
+ *      `assertRealBackend` verdict + worker call count + model). This is the
+ *      ingestable audit artifact: the +lift recomputes from it, the "because
+ *      Z" rationale survives in it, and a stub backend is detectable from it.
+ *
+ *   2. `loopProvenanceSpans()` — the same chain emitted as OTLP-ingestable
+ *      `TraceSpanEvent`s, pivoted on the substrate's standard
+ *      `tangle.runId` / `tangle.scenarioId` / `tangle.cellId` /
+ *      `tangle.generation` attributes (the same pivots `/adapters/otel`
+ *      reads). The hosted `/v1/ingest/traces` endpoint receives the FULL loop,
+ *      not just the `cost.*` spans `runCampaign` already emits per cell.
+ *
+ * The record is built from the substrate's own loop result + the per-call
+ * `RunRecord`s the worker emitted — no new measurement, no recomputation that
+ * could drift from what the gate actually saw.
+ */
+/** Stable sha256 (full hex) of a surface's effective text. Code surfaces hash
+ *  their worktree+base identity since the content lives in git. Distinct from
+ *  `surfaceHash` (16-char content fingerprint used as a loop identity key);
+ *  this is the byte-identical-verifiable content hash the provenance record +
+ *  `RunRecord.promptHash` carry. */
+declare function surfaceContentHash(surface: MutableSurface): string;
+interface LoopProvenanceCandidate {
+    /** Generation index this candidate was proposed in. */
+    generation: number;
+    /** 16-char loop-identity fingerprint (matches `GenerationCandidate.surfaceHash`). */
+    surfaceHash: string;
+    /** Full sha256 content hash — byte-identical-verifiable. */
+    contentHash: string;
+    /** Driver label, when the driver returned a `ProposedCandidate`. */
+    label?: string;
+    /** Driver rationale — the "because Z". When the driver returned a bare
+     *  surface (blind mutator) this is absent. */
+    rationale?: string;
+    /** Mean composite this candidate scored on the search split. */
+    composite: number;
+    /** Whether this candidate was promoted out of its generation. */
+    promoted: boolean;
+}
+interface LoopProvenanceBackend {
+    /** `assertRealBackend`-grade verdict over the worker call records. */
+    verdict: 'real' | 'mixed' | 'stub';
+    /** Number of worker LLM calls captured (the audit's "worker call count"). */
+    workerCallCount: number;
+    /** Distinct model ids observed across worker calls. */
+    models: string[];
+    totalInputTokens: number;
+    totalOutputTokens: number;
+    totalCostUsd: number;
+}
+/**
+ * The durable provenance record. Aligns to the hosted `EvalRunEvent` path but
+ * ADDS the rationale + the explicit baseline→candidate diff (both omitted from
+ * the bare hosted event) + backend provenance.
+ */
+interface LoopProvenanceRecord {
+    schema: 'tangle.loop-provenance.v1';
+    runId: string;
+    runDir: string;
+    timestamp: string;
+    /** Baseline + winner surface content hashes — distinguishable, byte-verifiable. */
+    baselineContentHash: string;
+    winnerContentHash: string;
+    /** Driver label/rationale for the promoted change. Absent ⇒ winner == baseline. */
+    winnerLabel?: string;
+    winnerRationale?: string;
+    /** The explicit baseline→winner unified diff the gate decided on. */
+    diff: string;
+    /** Every candidate across every generation, each carrying its rationale. */
+    candidates: LoopProvenanceCandidate[];
+    /** The gate verdict — decision + reasons + contributing gates + delta. */
+    gate: {
+        decision: GateDecision;
+        reasons: string[];
+        delta?: number;
+        contributingGates: Array<{
+            name: string;
+            passed: boolean;
+        }>;
+    };
+    /** baseline-on-holdout composite mean. */
+    baselineHoldoutComposite: number;
+    /** winner-on-holdout composite mean. */
+    winnerHoldoutComposite: number;
+    /** winnerHoldout - baselineHoldout — RECOMPUTABLE from this record. */
+    heldOutLift: number;
+    /** Backend provenance: stub-vs-real verdict + worker call count + models. */
+    backend: LoopProvenanceBackend;
+    totalCostUsd: number;
+    totalDurationMs: number;
+}
+interface BuildLoopProvenanceArgs<TArtifact, TScenario extends Scenario> {
+    runId: string;
+    runDir: string;
+    timestamp: string;
+    baselineSurface: MutableSurface;
+    winnerSurface: MutableSurface;
+    winnerLabel?: string;
+    winnerRationale?: string;
+    diff: string;
+    /** Per-generation candidate records straight off the loop result. */
+    generations: Array<{
+        generationIndex: number;
+        candidates: Array<{
+            surfaceHash: string;
+            composite: number;
+            label?: string;
+            rationale?: string;
+        }>;
+        promoted: string[];
+        /** Surfaces measured this generation, keyed positionally to candidates so
+         *  the content hash can be computed from the real surface text. */
+        surfaces: Array<{
+            surfaceHash: string;
+            surface: MutableSurface;
+        }>;
+    }>;
+    gate: GateResult;
+    baselineOnHoldout: CampaignResult<TArtifact, TScenario>;
+    winnerOnHoldout: CampaignResult<TArtifact, TScenario>;
+    /** Worker call records — the source for backend provenance. */
+    workerRecords: ReadonlyArray<RunRecord>;
+    totalCostUsd: number;
+    totalDurationMs: number;
+}
+/** Build the durable provenance record from a completed loop result. */
+declare function buildLoopProvenanceRecord<TArtifact, TScenario extends Scenario>(args: BuildLoopProvenanceArgs<TArtifact, TScenario>): LoopProvenanceRecord;
+/**
+ * Build the loop's OTLP-ingestable spans from a provenance record. One root
+ * span per loop (`tangle.runId`), one span per generation, one span per
+ * candidate (carrying its surfaceHash + label), and one span for the gate
+ * decision (carrying reasons + delta + lift). Candidate + gate spans pivot on
+ * the same `tangle.runId` / `tangle.generation` attributes `/adapters/otel`
+ * reads, so the hosted collector reconstructs the full tree.
+ *
+ * Times are synthesized monotonically off a single base so the span tree is
+ * orderable; the substrate does not retain per-candidate wall-clock starts.
+ */
+declare function loopProvenanceSpans(record: LoopProvenanceRecord, opts?: {
+    baseTimeMs?: number;
+}): TraceSpanEvent[];
+/** Canonical durable paths under the run dir. */
+declare function provenanceRecordPath(runDir: string): string;
+declare function provenanceSpansPath(runDir: string): string;
+interface EmitLoopProvenanceResult {
+    record: LoopProvenanceRecord;
+    spans: TraceSpanEvent[];
+    /** Absolute paths the record + spans were written to, when storage persists. */
+    recordPath: string;
+    spansPath: string;
+}
+interface EmitLoopProvenanceArgs<TArtifact, TScenario extends Scenario> extends BuildLoopProvenanceArgs<TArtifact, TScenario> {
+    /** Storage the record + spans are written through. */
+    storage: CampaignStorage;
+    /** When set, the spans are also shipped to the hosted `/v1/ingest/traces`
+     *  endpoint so the collector receives the full loop, not just `cost.*`. */
+    hostedClient?: HostedClient;
+}
+/**
+ * Build the provenance record + OTel spans and persist them durably under the
+ * run dir (and ship spans to a hosted collector when one is wired). Returns
+ * both artifacts so the caller can assert on / re-derive from them.
+ *
+ * Fail-loud: the durable write throws on storage failure (a swallowed write is
+ * exactly the "emitted but lost" failure this closes). The hosted span ship is
+ * the one best-effort leg — its failure is logged, not thrown, so an offline
+ * collector never fails the loop (the durable artifact is the source of truth).
+ */
+declare function emitLoopProvenance<TArtifact, TScenario extends Scenario>(args: EmitLoopProvenanceArgs<TArtifact, TScenario>): Promise<EmitLoopProvenanceResult>;
-export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverConstraints as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type GepaDriverOptions as a, type OpenAutoPrResult as b, type RunEvalOptions as c, type RunImprovementLoopOptions as d, type RunImprovementLoopResult as e, type RunOptimizationOptions as f, type RunOptimizationResult as g, composeGate as h, countSentenceEdits as i, defaultProductionGate as j, evolutionaryDriver as k, extractH2Sections as l, fsCampaignStorage as m, gepaDriver as n, heldOutGate as o, inMemoryCampaignStorage as p, openAutoPr as q, runCampaign as r, runEval as s, runImprovementLoop as t, runOptimization as u, surfaceHash as v };
+export { provenanceSpansPath as A, type BuildLoopProvenanceArgs as B, type CampaignStorage as C, type DefaultProductionGateOptions as D, type EmitLoopProvenanceArgs as E, runCampaign as F, type GepaDriverConstraints as G, type HeldOutGateOptions as H, runEval as I, runImprovementLoop as J, runOptimization as K, type LoopProvenanceBackend as L, surfaceContentHash as M, surfaceHash as N, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type EmitLoopProvenanceResult as a, type EvolutionaryDriverOptions as b, type GepaDriverOptions as c, type LoopProvenanceCandidate as d, type LoopProvenanceRecord as e, type OpenAutoPrResult as f, type RunEvalOptions as g, type RunImprovementLoopOptions as h, type RunImprovementLoopResult as i, type RunOptimizationOptions as j, type RunOptimizationResult as k, buildLoopProvenanceRecord as l, composeGate as m, countSentenceEdits as n, defaultProductionGate as o, defaultRenderDiff as p, emitLoopProvenance as q, evolutionaryDriver as r, extractH2Sections as s, fsCampaignStorage as t, gepaDriver as u, heldOutGate as v, inMemoryCampaignStorage as w, loopProvenanceSpans as x, openAutoPr as y, provenanceRecordPath as z };

package/dist/reporting.js CHANGED Viewed

@@ -26,7 +26,7 @@ import {
 } from "./chunk-S3SDD56V.js";
 import "./chunk-VSMTAMNK.js";
 import "./chunk-QYJT52YW.js";
-import "./chunk-NSBPE2FW.js";
+import "./chunk-PZ5AY32C.js";
 export {
   RESEARCH_REPORT_HARD_PAIR_FLOOR,
   assertReleaseConfidence,

package/dist/rl.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 import { R as RunRecord, a as RunSplitTag } from './run-record-etiCMsUq.js';
-import { k as CampaignResult } from './types-BgrxOJSf.js';
+import { k as CampaignResult } from './types-VCIXx_yo.js';
 import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-JP8EvnLv.js';
 export { r as runEvalCampaign } from './researcher-JP8EvnLv.js';
 import { S as Span } from './schema-m0gsnbt3.js';

package/dist/rl.js CHANGED Viewed

@@ -31,7 +31,7 @@ import "./chunk-PC4UYEBM.js";
 import {
   ValidationError
 } from "./chunk-QYJT52YW.js";
-import "./chunk-NSBPE2FW.js";
+import "./chunk-PZ5AY32C.js";
 // src/rl/compute-curves.ts
 async function runComputeCurve(opts) {

package/dist/{run-campaign-ZURVWMMI.js → run-campaign-5XENUKRF.js} RENAMED Viewed

@@ -1,10 +1,10 @@
 import {
   runCampaign
-} from "./chunk-74Y2EMNH.js";
+} from "./chunk-NOPYCRNG.js";
 import "./chunk-S3SDD56V.js";
 import "./chunk-QYJT52YW.js";
-import "./chunk-NSBPE2FW.js";
+import "./chunk-PZ5AY32C.js";
 export {
   runCampaign
 };
-//# sourceMappingURL=run-campaign-ZURVWMMI.js.map
+//# sourceMappingURL=run-campaign-5XENUKRF.js.map

package/dist/telemetry/file.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
 // src/telemetry/sink-file.ts
 import * as fs from "fs";

package/dist/telemetry/index.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
 // src/telemetry/schema.ts
 var TELEMETRY_SCHEMA_VERSION = 1;

package/dist/traces.js CHANGED Viewed

@@ -78,7 +78,7 @@ import {
   providerFromBaseUrl
 } from "./chunk-PC4UYEBM.js";
 import "./chunk-QYJT52YW.js";
-import "./chunk-NSBPE2FW.js";
+import "./chunk-PZ5AY32C.js";
 export {
   DEFAULT_REDACTION_RULES,
   DEFAULT_TRACE_ANALYST_BUDGETS,

package/dist/{types-BgrxOJSf.d.ts → types-VCIXx_yo.d.ts} RENAMED Viewed

@@ -118,6 +118,24 @@ interface CodeSurface {
  *  Tier 3 (knowledge) is owned by agent-knowledge and rides its own adapter,
  *  not this type. */
 type MutableSurface = string | CodeSurface;
+/** @experimental A driver proposal carrying the surface AND the WHY behind
+ *  it. Reflective drivers (`gepaDriver`) parse a `{label, rationale, payload}`
+ *  from the model; without this wrapper the loop keeps only `payload` and the
+ *  rationale that motivated the change is lost — the candidate becomes
+ *  unattributable. `propose()` may return either bare `MutableSurface`s (cheap
+ *  blind mutators) or these (reflective drivers); the loop normalizes both. */
+interface ProposedCandidate {
+    surface: MutableSurface;
+    /** Short human label for the change (≤ 40 chars typical). */
+    label: string;
+    /** Why this change was proposed — which failure it targets, which
+     *  primitive it used. Survives to `GenerationCandidate.rationale` and the
+     *  emitted provenance record. */
+    rationale: string;
+}
+/** @experimental Type guard: a proposal carrying its rationale vs a bare
+ *  surface. The loop branches on this to populate `GenerationCandidate`. */
+declare function isProposedCandidate(value: MutableSurface | ProposedCandidate): value is ProposedCandidate;
 /** @experimental Stateless surface mutation — given findings + current
  *  surface, return N candidate surfaces. Pure transform, no generation
  *  awareness. Reflective-mutation, `runMultiShotOptimization`, `AxGEPA`
@@ -129,7 +147,7 @@ interface Mutator<TFindings = unknown> {
         currentSurface: MutableSurface;
         populationSize: number;
         signal: AbortSignal;
-    }): Promise<MutableSurface[]>;
+    }): Promise<Array<MutableSurface | ProposedCandidate>>;
 }
 /** @experimental Everything a driver's `propose()` may read to plan the next
  *  batch of candidates. The first six fields are always present; the rest are
@@ -169,8 +187,11 @@ interface ProposeContext<TFindings = unknown> {
  *  are driver-agnostic. */
 interface ImprovementDriver<TFindings = unknown> {
     kind: string;
-    /** Plan: propose N candidate surfaces for the next generation. */
-    propose(ctx: ProposeContext<TFindings>): Promise<MutableSurface[]>;
+    /** Plan: propose N candidate surfaces for the next generation. A driver
+     *  may return bare `MutableSurface`s or `ProposedCandidate`s that carry the
+     *  `{label, rationale}` motivating the change — the loop threads the
+     *  rationale into `GenerationCandidate` and the emitted provenance. */
+    propose(ctx: ProposeContext<TFindings>): Promise<Array<MutableSurface | ProposedCandidate>>;
     /** Decide: stop early when the driver judges the search converged or
      *  exhausted. Default (omitted) runs all `maxGenerations`. */
     decide?(args: {
@@ -368,6 +389,13 @@ interface GenerationCandidate {
         scenarioId: string;
         composite: number;
     }>;
+    /** Driver-supplied short label for the change. Present when the driver
+     *  returned a `ProposedCandidate`; absent for bare-surface mutators. */
+    label?: string;
+    /** Driver-supplied rationale — WHY this candidate was proposed. The
+     *  "because rationale Z" the audit requires to survive to the result.
+     *  Present when the driver returned a `ProposedCandidate`. */
+    rationale?: string;
 }
 interface CampaignAggregates {
     byJudge: Record<string, JudgeAggregate>;
@@ -402,4 +430,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
     scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
 }
-export { type CodeSurface as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ProposeContext as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type LabeledScenarioWrite as c, type LabeledScenarioSampleArgs as d, type LabeledScenarioRecord as e, type LabelTrust as f, type CampaignAggregates as g, type CampaignArtifactWriter as h, type CampaignCellResult as i, type CampaignCostMeter as j, type CampaignResult as k, type CampaignTraceWriter as l, type GateContext as m, type GateDecision as n, type GateResult as o, type GenerationCandidate as p, type GenerationRecord as q, type JudgeAggregate as r, type JudgeDimension as s, type LabeledScenarioSource as t, type Mutator as u, type ScenarioAggregate as v, type SessionScript as w, labelTrustRank as x };
+export { type CodeSurface as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ProposeContext as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type LabeledScenarioWrite as c, type LabeledScenarioSampleArgs as d, type LabeledScenarioRecord as e, type LabelTrust as f, type CampaignAggregates as g, type CampaignArtifactWriter as h, type CampaignCellResult as i, type CampaignCostMeter as j, type CampaignResult as k, type CampaignTraceWriter as l, type GateContext as m, type GateDecision as n, type GateResult as o, type GenerationCandidate as p, type GenerationRecord as q, type JudgeAggregate as r, type JudgeDimension as s, type LabeledScenarioSource as t, type Mutator as u, type ProposedCandidate as v, type ScenarioAggregate as w, type SessionScript as x, isProposedCandidate as y, labelTrustRank as z };

package/dist/wire/index.js CHANGED Viewed

@@ -38,7 +38,7 @@ import {
 import "../chunk-VXNVVBZO.js";
 import "../chunk-PC4UYEBM.js";
 import "../chunk-QYJT52YW.js";
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
 export {
   BUILTIN_RUBRICS,
   ErrorResponseSchema,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.59.1",
+  "version": "0.60.0",
   "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {
@@ -144,6 +144,18 @@
   "publishConfig": {
     "access": "public"
   },
+  "scripts": {
+    "build": "tsup && pnpm openapi",
+    "dev": "tsup --watch",
+    "prepare": "husky",
+    "prepublishOnly": "pnpm build",
+    "test": "vitest run",
+    "test:watch": "vitest",
+    "typecheck": "tsc --noEmit",
+    "lint": "biome check src",
+    "format": "biome format --write src",
+    "openapi": "node dist/cli.js openapi --out dist/openapi.json"
+  },
   "dependencies": {
     "@asteasolutions/zod-to-openapi": "^8.5.0",
     "@ax-llm/ax": "^19.0.25",
@@ -171,6 +183,16 @@
     "typescript": "^5.7.0",
     "vitest": "^3.0.0"
   },
+  "pnpm": {
+    "minimumReleaseAge": 4320,
+    "minimumReleaseAgeExclude": [
+      "@tangle-network/sandbox"
+    ],
+    "overrides": {
+      "postcss@<8.5.10": "^8.5.10",
+      "ws@>=8.0.0 <8.20.1": "^8.20.1"
+    }
+  },
   "engines": {
     "node": ">=20"
   },
@@ -183,14 +205,5 @@
     ]
   },
   "license": "MIT",
-  "scripts": {
-    "build": "tsup && pnpm openapi",
-    "dev": "tsup --watch",
-    "test": "vitest run",
-    "test:watch": "vitest",
-    "typecheck": "tsc --noEmit",
-    "lint": "biome check src",
-    "format": "biome format --write src",
-    "openapi": "node dist/cli.js openapi --out dist/openapi.json"
-  }
-}
+  "packageManager": "pnpm@10.22.0"
+}