npm - @tangle-network/agent-eval - Versions diffs - 0.69.0 → 0.70.0 - Mend

@tangle-network/agent-eval 0.69.0 → 0.70.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

package/CHANGELOG.md +11 -0
package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/otel.d.ts +2 -2
package/dist/campaign/index.d.ts +9 -7
package/dist/campaign/index.js +2 -2
package/dist/{chunk-E24XD7A2.js → chunk-GYELOWB6.js} +2 -2
package/dist/{chunk-JFGZPUMU.js → chunk-ZZCQQHW7.js} +26 -6
package/dist/chunk-ZZCQQHW7.js.map +1 -0
package/dist/contract/index.d.ts +8 -8
package/dist/contract/index.js +2 -2
package/dist/hosted/index.d.ts +2 -2
package/dist/{index-DSEHMwvS.d.ts → index-BGBrVS24.d.ts} +1 -1
package/dist/index.d.ts +7 -2
package/dist/index.js +1 -1
package/dist/openapi.json +1 -1
package/dist/{provenance-CChUqexv.d.ts → provenance-C69gLUXH.d.ts} +3 -3
package/dist/rl.d.ts +1 -1
package/dist/{run-improvement-loop-BKpM5T4t.d.ts → run-improvement-loop-Bzamo6GB.d.ts} +1 -1
package/dist/{types-c2R2kfmv.d.ts → types-CnmZ2bkP.d.ts} +7 -1
package/package.json +1 -1
package/dist/chunk-JFGZPUMU.js.map +0 -1
package/dist/{chunk-E24XD7A2.js.map → chunk-GYELOWB6.js.map} +0 -0

package/CHANGELOG.md CHANGED Viewed

@@ -4,6 +4,17 @@ All notable changes to `@tangle-network/agent-eval` and its sibling `agent-eval-
 ---
+## [0.70.0] — 2026-05-31 — error-grounded reflection (the driver targets real failures, not blind rewrites)
+Adversarial verification on TWO domains (legal + tax, two worker models) found the same root cause: the gepaDriver's candidates **regressed** the baseline, so the gate correctly held — but nothing improved. The driver was reflecting on per-scenario *scores* only; the judge's `notes` (the "why it failed") were computed but **dropped** before the reflection. So it proposed generic rewrites a capable model already knows, which distract rather than help.
+### Fixed
+- **Judge `notes` now reach the reflective driver.** `campaignBreakdown` collects each scenario's judge `notes` (deduped) into `scenarios[].notes`; `GenerationCandidate.scenarios` + `CampaignBreakdown.scenarios` carry it; `gepaDriver`'s `buildEvidence` surfaces it as `TrialTrace.failureNote`; `buildReflectionPrompt` renders a **"Why it scored low"** block per bottom trial. The optimizer now grounds its next edit on the actual failure pattern.
+- **Anti-overfit by contract + by construction.** The `notes` are documented as GENERALIZABLE failure patterns (which checks/lines/dimensions failed, and how) — NOT case-specific ground truth; leaking expected answers would be memorization. And the held-out gate is the structural backstop: a candidate that overfits train cannot clear the paired-bootstrap CI on cases the driver never saw.
+Generic — any agent benefits by having its judge emit informative `notes`. 3 new tests (notes surfaced + deduped + rendered into the reflection); full suite (1645) green.
 ## [0.69.0] — 2026-05-30 — strong generic baseline roles (engineer / researcher / generalist)
 The structured profile (0.68.0) had a hollow top zone — `baselineProfile` took an arbitrary `role` string. Products are file-producing, tool-using agents living in a sandbox, but nothing gave them a strong operator foundation. This adds three generically-useful, verification-first baseline roles distilled from agent-runtime's `coderProfile` doctrine.

package/dist/adapters/http.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-c2R2kfmv.js';
+import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-CnmZ2bkP.js';
 import '../run-record-BgTFzO2r.js';
 import '../errors-Dwqw-T_m.js';
 import '../schema-m0gsnbt3.js';

package/dist/adapters/langchain.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-c2R2kfmv.js';
+import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-CnmZ2bkP.js';
 import '../run-record-BgTFzO2r.js';
 import '../errors-Dwqw-T_m.js';
 import '../schema-m0gsnbt3.js';

package/dist/adapters/otel.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
-import { T as TraceSpanEvent, H as HostedClient } from '../index-DSEHMwvS.js';
-import '../types-c2R2kfmv.js';
+import { T as TraceSpanEvent, H as HostedClient } from '../index-BGBrVS24.js';
+import '../types-CnmZ2bkP.js';
 import '../run-record-BgTFzO2r.js';
 import '../errors-Dwqw-T_m.js';
 import '../schema-m0gsnbt3.js';

package/dist/campaign/index.d.ts CHANGED Viewed

@@ -1,9 +1,9 @@
-import { a as RunCampaignOptions, C as CampaignStorage } from '../run-improvement-loop-BKpM5T4t.js';
-export { d as GepaDriverConstraints, G as GepaDriverOptions, O as OpenAutoPrOptions, e as OpenAutoPrResult, b as RunImprovementLoopOptions, R as RunImprovementLoopResult, h as RunOptimizationOptions, j as RunOptimizationResult, k as countSentenceEdits, l as defaultRenderDiff, m as extractH2Sections, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, o as openAutoPr, r as runCampaign, c as runImprovementLoop, n as runOptimization, s as surfaceHash } from '../run-improvement-loop-BKpM5T4t.js';
-export { B as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, a as EmitLoopProvenanceArgs, b as EmitLoopProvenanceResult, E as EvolutionaryDriverOptions, H as HeldOutGateOptions, f as LoopProvenanceBackend, g as LoopProvenanceCandidate, L as LoopProvenanceRecord, R as RunEvalOptions, i as buildLoopProvenanceRecord, c as composeGate, d as defaultProductionGate, j as emitLoopProvenance, e as evolutionaryDriver, h as heldOutGate, l as loopProvenanceSpans, p as provenanceRecordPath, k as provenanceSpansPath, r as runEval, s as surfaceContentHash } from '../provenance-CChUqexv.js';
+import { a as RunCampaignOptions, C as CampaignStorage } from '../run-improvement-loop-Bzamo6GB.js';
+export { d as GepaDriverConstraints, G as GepaDriverOptions, O as OpenAutoPrOptions, e as OpenAutoPrResult, b as RunImprovementLoopOptions, R as RunImprovementLoopResult, h as RunOptimizationOptions, j as RunOptimizationResult, k as countSentenceEdits, l as defaultRenderDiff, m as extractH2Sections, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, o as openAutoPr, r as runCampaign, c as runImprovementLoop, n as runOptimization, s as surfaceHash } from '../run-improvement-loop-Bzamo6GB.js';
+export { B as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, a as EmitLoopProvenanceArgs, b as EmitLoopProvenanceResult, E as EvolutionaryDriverOptions, H as HeldOutGateOptions, f as LoopProvenanceBackend, g as LoopProvenanceCandidate, L as LoopProvenanceRecord, R as RunEvalOptions, i as buildLoopProvenanceRecord, c as composeGate, d as defaultProductionGate, j as emitLoopProvenance, e as evolutionaryDriver, h as heldOutGate, l as loopProvenanceSpans, p as provenanceRecordPath, k as provenanceSpansPath, r as runEval, s as surfaceContentHash } from '../provenance-C69gLUXH.js';
 import { L as LlmClientOptions } from '../llm-client-DbjLfz-K.js';
-import { I as ImprovementDriver, J as JudgeScore, L as LabeledScenarioStore, q as LabeledScenarioWrite, r as LabeledScenarioSampleArgs, s as LabeledScenarioRecord, t as LabelTrust, S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, u as LabeledScenarioSource, f as CampaignResult, h as CodeSurface } from '../types-c2R2kfmv.js';
-export { C as CampaignAggregates, c as CampaignArtifactWriter, d as CampaignCellResult, e as CampaignCostMeter, v as CampaignTokenUsage, g as CampaignTraceWriter, D as DispatchFn, G as Gate, i as GateContext, j as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, w as JudgeAggregate, n as JudgeDimension, o as Mutator, O as OptimizerConfig, P as ParetoParent, x as ProposeContext, y as ProposedCandidate, R as RedactionStatus, z as ScenarioAggregate, p as SessionScript, T as TraceSpan, A as isProposedCandidate, B as labelTrustRank } from '../types-c2R2kfmv.js';
+import { I as ImprovementDriver, J as JudgeScore, L as LabeledScenarioStore, q as LabeledScenarioWrite, r as LabeledScenarioSampleArgs, s as LabeledScenarioRecord, t as LabelTrust, S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, u as LabeledScenarioSource, f as CampaignResult, h as CodeSurface } from '../types-CnmZ2bkP.js';
+export { C as CampaignAggregates, c as CampaignArtifactWriter, d as CampaignCellResult, e as CampaignCostMeter, v as CampaignTokenUsage, g as CampaignTraceWriter, D as DispatchFn, G as Gate, i as GateContext, j as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, w as JudgeAggregate, n as JudgeDimension, o as Mutator, O as OptimizerConfig, P as ParetoParent, x as ProposeContext, y as ProposedCandidate, R as RedactionStatus, z as ScenarioAggregate, p as SessionScript, T as TraceSpan, A as isProposedCandidate, B as labelTrustRank } from '../types-CnmZ2bkP.js';
 import { a as PairedBootstrapResult } from '../statistics-B7yCbi9i.js';
 import { A as AgentProfile, B as BackendIntegrityReport } from '../agent-profile-DzcPHR1Z.js';
 import { A as AgentEvalError } from '../errors-Dwqw-T_m.js';
@@ -12,7 +12,7 @@ import '../red-team-DW9Ca_tj.js';
 import '../dataset-B2kL-fSM.js';
 import '../store-CKUAgsJz.js';
 import '../schema-m0gsnbt3.js';
-import '../index-DSEHMwvS.js';
+import '../index-BGBrVS24.js';
 import '../summary-report-ByiOUrHj.js';
 import '../failure-cluster-CL7IVgkJ.js';
 import '../judge-calibration-DilmB3Ml.js';
@@ -686,10 +686,12 @@ declare function campaignMeanComposite<TArtifact, TScenario extends Scenario>(ca
 interface CampaignBreakdown {
     /** Mean score per judge dimension across all cells. */
     dimensions: Record<string, number>;
-    /** Per-scenario composite (mean over reps + judges). */
+    /** Per-scenario composite (mean over reps + judges) + the judge's free-form
+     *  `notes` for that scenario (the "why" a reflective driver grounds on). */
     scenarios: Array<{
         scenarioId: string;
         composite: number;
+        notes?: string;
     }>;
 }
 /** Per-candidate evidence a reflective/patch driver grounds its next proposal

package/dist/campaign/index.js CHANGED Viewed

@@ -7,7 +7,7 @@ import {
   heldoutSignificance,
   pairHoldout,
   runEval
-} from "../chunk-E24XD7A2.js";
+} from "../chunk-GYELOWB6.js";
 import {
   agentProfileHash
 } from "../chunk-PQV2TKC3.js";
@@ -31,7 +31,7 @@ import {
   runOptimization,
   surfaceContentHash,
   surfaceHash
-} from "../chunk-JFGZPUMU.js";
+} from "../chunk-ZZCQQHW7.js";
 import {
   assertRealBackend,
   fsCampaignStorage,

package/dist/{chunk-E24XD7A2.js → chunk-GYELOWB6.js} RENAMED Viewed

@@ -1,7 +1,7 @@
 import {
   runCanaries,
   scoreRedTeamOutput
-} from "./chunk-JFGZPUMU.js";
+} from "./chunk-ZZCQQHW7.js";
 import {
   runCampaign
 } from "./chunk-6XQIEUQ2.js";
@@ -315,4 +315,4 @@ export {
   defaultProductionGate,
   runEval
 };
-//# sourceMappingURL=chunk-E24XD7A2.js.map
+//# sourceMappingURL=chunk-GYELOWB6.js.map

package/dist/{chunk-JFGZPUMU.js → chunk-ZZCQQHW7.js} RENAMED Viewed

@@ -671,6 +671,10 @@ function buildReflectionPrompt(ctx) {
       sections.push(
         `### Trial \`${trial.id}\` \u2014 score ${trial.score.toFixed(2)}${trial.inputName ? ` (${trial.inputName})` : ""}`
       );
+      if (trial.failureNote) {
+        sections.push("");
+        sections.push(`**Why it scored low:** ${truncate(trial.failureNote, 600)}`);
+      }
       const missed = (trial.expectations ?? []).filter((e) => !e.matched);
       if (missed.length > 0) {
         sections.push("");
@@ -986,7 +990,10 @@ function buildEvidence(ctx, evidenceK, baseTarget) {
   const byScore = [...best.scenarios].sort((a, b) => b.composite - a.composite);
   const toTrace = (s) => ({
     id: s.scenarioId,
-    score: s.composite
+    score: s.composite,
+    // The judge's "why it scored low" — grounds the reflection on real failure
+    // patterns instead of blind rephrasing. Generalizable by the judge contract.
+    ...s.notes ? { failureNote: s.notes } : {}
   });
   const top = byScore.slice(0, evidenceK).map(toTrace);
   const bottom = byScore.slice(-evidenceK).reverse().map(toTrace);
@@ -1156,6 +1163,7 @@ function campaignBreakdown(campaign) {
   const dimSums = {};
   const dimCounts = {};
   const byScenario = /* @__PURE__ */ new Map();
+  const notesByScenario = /* @__PURE__ */ new Map();
   for (const cell of campaign.cells) {
     const judgeScores = Object.values(cell.judgeScores);
     if (judgeScores.length === 0) continue;
@@ -1163,6 +1171,13 @@ function campaignBreakdown(campaign) {
     const arr = byScenario.get(cell.scenarioId) ?? [];
     arr.push(cellComposite);
     byScenario.set(cell.scenarioId, arr);
+    for (const s of judgeScores) {
+      if (s.notes && s.notes.trim()) {
+        const set = notesByScenario.get(cell.scenarioId) ?? /* @__PURE__ */ new Set();
+        set.add(s.notes.trim());
+        notesByScenario.set(cell.scenarioId, set);
+      }
+    }
     for (const score of judgeScores) {
       for (const [key, value] of Object.entries(score.dimensions)) {
         dimSums[key] = (dimSums[key] ?? 0) + value;
@@ -1175,10 +1190,15 @@ function campaignBreakdown(campaign) {
     const count = dimCounts[key] ?? 0;
     dimensions[key] = count > 0 ? (dimSums[key] ?? 0) / count : 0;
   }
-  const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => ({
-    scenarioId,
-    composite: comps.reduce((a, b) => a + b, 0) / comps.length
-  }));
+  const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => {
+    const notesSet = notesByScenario.get(scenarioId);
+    const notes = notesSet && notesSet.size > 0 ? [...notesSet].join(" | ") : void 0;
+    return {
+      scenarioId,
+      composite: comps.reduce((a, b) => a + b, 0) / comps.length,
+      ...notes ? { notes } : {}
+    };
+  });
   return { dimensions, scenarios };
 }
@@ -1764,4 +1784,4 @@ export {
   provenanceSpansPath,
   emitLoopProvenance
 };
-//# sourceMappingURL=chunk-JFGZPUMU.js.map
+//# sourceMappingURL=chunk-ZZCQQHW7.js.map