npm - @tangle-network/agent-eval - Versions diffs - 0.59.1 → 0.60.0 - Mend

@tangle-network/agent-eval 0.59.1 → 0.60.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (57)

package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/http.js +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/langchain.js +1 -1
package/dist/adapters/otel.d.ts +2 -2
package/dist/adapters/otel.js +1 -1
package/dist/benchmarks/index.js +2 -2
package/dist/builder-eval/index.js +1 -1
package/dist/campaign/index.d.ts +7 -3
package/dist/campaign/index.js +21 -16
package/dist/campaign/index.js.map +1 -1
package/dist/{chunk-MHQPVHXU.js → chunk-6QDKWHLS.js} +2 -2
package/dist/{chunk-N4SBKEPJ.js → chunk-GBHRUAOF.js} +106 -1
package/dist/chunk-GBHRUAOF.js.map +1 -0
package/dist/{chunk-JB4UWIM6.js → chunk-LBSXXH56.js} +265 -14
package/dist/chunk-LBSXXH56.js.map +1 -0
package/dist/{chunk-74Y2EMNH.js → chunk-NOPYCRNG.js} +6 -5
package/dist/{chunk-74Y2EMNH.js.map → chunk-NOPYCRNG.js.map} +1 -1
package/dist/chunk-PZ5AY32C.js +10 -0
package/dist/chunk-SHTXZ4O2.js +113 -0
package/dist/chunk-SHTXZ4O2.js.map +1 -0
package/dist/cli.js +1 -1
package/dist/contract/index.d.ts +42 -10
package/dist/contract/index.js +55 -15
package/dist/contract/index.js.map +1 -1
package/dist/control.js +1 -1
package/dist/governance/index.js +1 -1
package/dist/hosted/index.d.ts +2 -2
package/dist/hosted/index.js +1 -1
package/dist/{index-D2nT6_KT.d.ts → index-BIkvdkSU.d.ts} +1 -1
package/dist/index.js +8 -8
package/dist/knowledge/index.js +1 -1
package/dist/matrix/index.js +1 -1
package/dist/meta-eval/index.js +1 -1
package/dist/multishot/index.js +1 -1
package/dist/openapi.json +1 -1
package/dist/pipelines/index.js +1 -1
package/dist/prm/index.js +1 -1
package/dist/{run-improvement-loop-BhfdjrMY.d.ts → provenance-BM8vmMBa.d.ts} +205 -3
package/dist/reporting.js +1 -1
package/dist/rl.d.ts +1 -1
package/dist/rl.js +1 -1
package/dist/{run-campaign-ZURVWMMI.js → run-campaign-5XENUKRF.js} +3 -3
package/dist/telemetry/file.js +1 -1
package/dist/telemetry/index.js +1 -1
package/dist/traces.js +1 -1
package/dist/{types-BgrxOJSf.d.ts → types-VCIXx_yo.d.ts} +32 -4
package/dist/wire/index.js +1 -1
package/package.json +25 -12
package/dist/chunk-JB4UWIM6.js.map +0 -1
package/dist/chunk-N4SBKEPJ.js.map +0 -1
package/dist/chunk-NSBPE2FW.js +0 -17
package/dist/chunk-ZWEQJIM6.js +0 -220
package/dist/chunk-ZWEQJIM6.js.map +0 -1
/package/dist/{chunk-MHQPVHXU.js.map → chunk-6QDKWHLS.js.map} +0 -0
/package/dist/{chunk-NSBPE2FW.js.map → chunk-PZ5AY32C.js.map} +0 -0
/package/dist/{run-campaign-ZURVWMMI.js.map → run-campaign-5XENUKRF.js.map} +0 -0

package/dist/contract/index.d.ts CHANGED Viewed

@@ -1,13 +1,13 @@
-import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-BgrxOJSf.js';
-export { g as CampaignAggregates, h as CampaignArtifactWriter, i as CampaignCellResult, j as CampaignCostMeter, k as CampaignResult, l as CampaignTraceWriter, C as CodeSurface, D as Dispatch, m as GateContext, n as GateDecision, o as GateResult, p as GenerationCandidate, q as GenerationRecord, s as JudgeDimension, J as JudgeScore, u as Mutator, O as OptimizerConfig, w as SessionScript } from '../types-BgrxOJSf.js';
-import { C as CampaignStorage, e as RunImprovementLoopResult } from '../run-improvement-loop-BhfdjrMY.js';
-export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, a as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, c as RunEvalOptions, d as RunImprovementLoopOptions, h as composeGate, j as defaultProductionGate, k as evolutionaryDriver, m as fsCampaignStorage, n as gepaDriver, o as heldOutGate, p as inMemoryCampaignStorage, r as runCampaign, s as runEval, t as runImprovementLoop } from '../run-improvement-loop-BhfdjrMY.js';
+import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-VCIXx_yo.js';
+export { g as CampaignAggregates, h as CampaignArtifactWriter, i as CampaignCellResult, j as CampaignCostMeter, k as CampaignResult, l as CampaignTraceWriter, C as CodeSurface, D as Dispatch, m as GateContext, n as GateDecision, o as GateResult, p as GenerationCandidate, q as GenerationRecord, s as JudgeDimension, J as JudgeScore, u as Mutator, O as OptimizerConfig, x as SessionScript } from '../types-VCIXx_yo.js';
+import { C as CampaignStorage, e as LoopProvenanceRecord, i as RunImprovementLoopResult } from '../provenance-BM8vmMBa.js';
+export { D as DefaultProductionGateOptions, b as EvolutionaryDriverOptions, c as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, g as RunEvalOptions, h as RunImprovementLoopOptions, m as composeGate, o as defaultProductionGate, r as evolutionaryDriver, t as fsCampaignStorage, u as gepaDriver, v as heldOutGate, w as inMemoryCampaignStorage, F as runCampaign, I as runEval, J as runImprovementLoop } from '../provenance-BM8vmMBa.js';
 export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
-import { a as HostedTenant, I as InsightReport, T as TraceSpanEvent } from '../index-D2nT6_KT.js';
-export { F as FailureClusterInsight, b as InterRaterInsight, J as JudgeInsight, L as LiftInsight, O as OutcomeCorrelationInsight, R as Recommendation, c as ReleaseSummary, S as ScalarDistribution } from '../index-D2nT6_KT.js';
+import { a as HostedTenant, I as InsightReport, T as TraceSpanEvent } from '../index-BIkvdkSU.js';
+export { F as FailureClusterInsight, b as InterRaterInsight, J as JudgeInsight, L as LiftInsight, O as OutcomeCorrelationInsight, R as Recommendation, c as ReleaseSummary, S as ScalarDistribution } from '../index-BIkvdkSU.js';
+import { R as RunRecord, a as RunSplitTag } from '../run-record-etiCMsUq.js';
 import { A as AnalystRegistry } from '../registry-DK9kqXvb.js';
 import { a as DatasetScenario } from '../dataset-BlwAtYYf.js';
-import { R as RunRecord, a as RunSplitTag } from '../run-record-etiCMsUq.js';
 import '../llm-client-BXVRUZyX.js';
 import '../errors-mje_cKOs.js';
 import '../raw-provider-sink-C46HDghv.js';
@@ -131,12 +131,30 @@ interface SelfImproveOptions<TScenario extends Scenario, TArtifact> {
     /** LLM config consumed by the default `gepaDriver`. Ignored if you pass
      *  your own `driver`. */
     llm?: SelfImproveLlm;
-    /** Storage backend. Default `inMemoryCampaignStorage()` — nothing
-     *  persists past the call. Pass `fsCampaignStorage()` to write to disk. */
+    /** Storage backend. Default is DURABLE: when a real (non-`mem://`) `runDir`
+     *  is available, the substrate defaults to `fsCampaignStorage()` so the
+     *  provenance record + OTel spans survive the call. Pass
+     *  `inMemoryCampaignStorage()` explicitly to opt OUT (tests, edge runtimes).
+     *  Default when `runDir` is `mem://...` (or unset): in-memory. */
     storage?: CampaignStorage;
     /** Run directory (logical for in-memory storage, real path for fs).
-     *  Default `mem://selfImprove-<timestamp>`. */
+     *  Default `mem://selfImprove-<timestamp>` (in-memory, non-durable). Pass a
+     *  real path to persist the provenance record + spans. */
     runDir?: string;
+    /**
+     * Worker call records for backend provenance. The agent is opaque to the
+     * substrate (it returns an artifact, not token usage), so to capture an
+     * `assertRealBackend`-grade verdict + worker call count + model in the
+     * provenance record, the agent reports its per-call `RunRecord`s here.
+     * Called once after the loop; return the records the agent accumulated.
+     * When unset, backend provenance is derived from campaign cells (cost only;
+     * verdict will read `stub` without token usage — the honest signal that no
+     * token channel was wired).
+     */
+    collectWorkerRecords?: () => RunRecord[];
+    /** Fires once the durable provenance record + OTel spans are emitted.
+     *  Receives the structured record for inline assertions / custom routing. */
+    onProvenance?: (record: LoopProvenanceRecord) => void;
     /** Distributed-driver seam — same as `RunCampaignOptions.cellPlacement`.
      *  Returns an opaque placement key the substrate forwards to your agent
      *  as `ctx.placement`. Combined with `httpDispatch` from
@@ -184,10 +202,24 @@ interface SelfImproveResult<TScenario extends Scenario, TArtifact> {
         compositeMean: number;
         perScenario: Record<string, number>;
         surface: MutableSurface;
+        /** Driver label for the promoted change. Absent ⇒ winner == baseline or
+         *  a bare-surface mutator. */
+        label?: string;
+        /** Driver rationale — the "because Z" that motivated the promoted change.
+         *  Threaded from the driver's `ProposedCandidate` through the loop.
+         *  Absent ⇒ winner == baseline. */
+        rationale?: string;
     };
     /** `winner.compositeMean - baselineOnHoldout.compositeMean`. Positive
      *  means the gate observed improvement. */
     lift: number;
+    /** The explicit baseline→winner unified diff. Always present (empty string
+     *  when winner == baseline). */
+    diff: string;
+    /** Durable, queryable provenance record: candidate→cell→gate→promote chain +
+     *  rationale + diff + backend provenance. The artifact the hosted ingest
+     *  path stores; the +lift RECOMPUTES from `record.heldOutLift`. */
+    provenance: LoopProvenanceRecord;
     /** `defaultProductionGate.decide()` result. */
     gateDecision: 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling';
     /** Number of generations actually explored (may be less than the

package/dist/contract/index.js CHANGED Viewed

@@ -1,25 +1,28 @@
 import {
   composeGate,
   defaultProductionGate,
+  emitLoopProvenance,
   evolutionaryDriver,
   gepaDriver,
   heldOutGate,
   runEval,
-  runImprovementLoop
-} from "../chunk-JB4UWIM6.js";
+  runImprovementLoop,
+  surfaceContentHash
+} from "../chunk-LBSXXH56.js";
 import {
   fsCampaignStorage,
   inMemoryCampaignStorage,
   runCampaign
-} from "../chunk-74Y2EMNH.js";
+} from "../chunk-NOPYCRNG.js";
 import {
   createHostedClient
 } from "../chunk-FQK2CCIM.js";
 import {
-  checkCanaries,
+  checkCanaries
+} from "../chunk-SHTXZ4O2.js";
+import {
   summarizeBackendIntegrity
-} from "../chunk-ZWEQJIM6.js";
-import "../chunk-N4SBKEPJ.js";
+} from "../chunk-GBHRUAOF.js";
 import "../chunk-YV7J7X5N.js";
 import {
   FileSystemOutcomeStore,
@@ -42,7 +45,10 @@ import "../chunk-VSMTAMNK.js";
 import "../chunk-VXNVVBZO.js";
 import "../chunk-PC4UYEBM.js";
 import "../chunk-QYJT52YW.js";
-import "../chunk-NSBPE2FW.js";
+import "../chunk-PZ5AY32C.js";
+// src/contract/self-improve.ts
+import { createHash } from "crypto";
 // src/contract/analyze-runs.ts
 async function analyzeRuns(opts) {
@@ -848,8 +854,9 @@ async function selfImprove(opts) {
     holdoutScenarios: holdout,
     deltaThreshold: 0.05
   });
-  const storage = opts.storage ?? inMemoryCampaignStorage();
   const runDir = opts.runDir ?? `mem://selfImprove-${startedAt}`;
+  const isMemRunDir = runDir.startsWith("mem://");
+  const storage = opts.storage ?? (isMemRunDir ? inMemoryCampaignStorage() : fsCampaignStorage());
   if (opts.onProgress) {
     opts.onProgress({ kind: "baseline.started", scenarios: opts.scenarios.length });
   }
@@ -892,22 +899,53 @@ async function selfImprove(opts) {
   );
   const insight = await analyzeRuns({
     runs: [
-      ...cellsToRunRecords(result.baselineCampaign.cells, "baseline", runDir),
-      ...cellsToRunRecords(result.winnerOnHoldout.cells, "winner", runDir)
+      ...cellsToRunRecords(result.baselineCampaign.cells, "baseline", runDir, opts.baselineSurface),
+      ...cellsToRunRecords(result.winnerOnHoldout.cells, "winner", runDir, result.winnerSurface)
     ],
     baselineCandidateId: "baseline",
     candidateCandidateId: "winner"
   });
+  const durationMs = Date.now() - startedAt;
+  const workerRecords = opts.collectWorkerRecords?.() ?? cellsToRunRecords(result.winnerOnHoldout.cells, "winner", runDir, result.winnerSurface);
+  const { record: provenance } = await emitLoopProvenance({
+    runId: `${runDir}#${startedAt}`,
+    runDir,
+    timestamp: new Date(startedAt).toISOString(),
+    baselineSurface: opts.baselineSurface,
+    winnerSurface: result.winnerSurface,
+    winnerLabel: result.winnerLabel,
+    winnerRationale: result.winnerRationale,
+    diff: result.promotedDiff,
+    generations: result.generations.map((g) => ({
+      generationIndex: g.record.generationIndex,
+      candidates: g.record.candidates,
+      promoted: g.record.promoted,
+      surfaces: g.surfaces.map((s) => ({ surfaceHash: s.surfaceHash, surface: s.surface }))
+    })),
+    gate: result.gateResult,
+    baselineOnHoldout: result.baselineOnHoldout,
+    winnerOnHoldout: result.winnerOnHoldout,
+    workerRecords,
+    totalCostUsd: totalCost,
+    totalDurationMs: durationMs,
+    storage,
+    hostedClient: opts.hostedTenant ? createHostedClient(opts.hostedTenant) : void 0
+  });
+  if (opts.onProvenance) opts.onProvenance(provenance);
   const summary = {
     baseline,
     winner: {
       ...winnerStats,
-      surface: result.winnerSurface
+      surface: result.winnerSurface,
+      ...result.winnerLabel ? { label: result.winnerLabel } : {},
+      ...result.winnerRationale ? { rationale: result.winnerRationale } : {}
     },
     lift: winnerStats.compositeMean - baseline.compositeMean,
+    diff: result.promotedDiff,
+    provenance,
     gateDecision: result.gateResult.decision,
     generationsExplored: result.generations.length,
-    durationMs: Date.now() - startedAt,
+    durationMs,
     totalCostUsd: totalCost,
     insight,
     raw: result
@@ -989,7 +1027,9 @@ function hashString(s) {
   }
   return h.toString(16).padStart(8, "0");
 }
-function cellsToRunRecords(cells, candidateId, runId) {
+function cellsToRunRecords(cells, candidateId, runId, surface) {
+  const promptHash = surfaceContentHash(surface);
+  const configHash = `sha256:${createHash("sha256").update(candidateId).digest("hex")}`;
   return cells.map((cell) => {
     const perJudge = {};
     const perDimMeanAccum = {};
@@ -1027,8 +1067,8 @@ function cellsToRunRecords(cells, candidateId, runId) {
       // Synthesize a stable seed for that pairing.
       seed: cell.rep * 1e6 + hashString(cell.scenarioId).slice(0, 6).split("").reduce((a, c) => a * 31 + c.charCodeAt(0) >>> 0, 0),
       model: "campaign-cell",
-      promptHash: "sha256:cell",
-      configHash: "sha256:cell",
+      promptHash,
+      configHash,
       commitSha: "cell",
       wallMs: cell.durationMs,
       costUsd: cell.costUsd,