@tangle-network/agent-eval 0.69.0 → 0.70.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -4,6 +4,17 @@ All notable changes to `@tangle-network/agent-eval` and its sibling `agent-eval-
4
4
 
5
5
  ---
6
6
 
7
+ ## [0.70.0] — 2026-05-31 — error-grounded reflection (the driver targets real failures, not blind rewrites)
8
+
9
+ Adversarial verification on TWO domains (legal + tax, two worker models) surfaced the same failure: the `gepaDriver`'s candidates **regressed** against the baseline, so the gate correctly held — but nothing improved. The root cause: the driver was reflecting on per-scenario *scores* only; the judge's `notes` (the "why it failed") were computed but **dropped** before the reflection step. As a result, it proposed generic rewrites that restate what a capable model already knows, which distract rather than help.
10
+
11
+ ### Fixed
12
+
13
+ - **Judge `notes` now reach the reflective driver.** `campaignBreakdown` collects each scenario's judge `notes` (deduped) into `scenarios[].notes`; `GenerationCandidate.scenarios` + `CampaignBreakdown.scenarios` carry it; `gepaDriver`'s `buildEvidence` surfaces it as `TrialTrace.failureNote`; `buildReflectionPrompt` renders a **"Why it scored low"** block per bottom trial. The optimizer now grounds its next edit on the actual failure pattern.
14
+ - **Anti-overfit by contract + by construction.** The `notes` are documented as GENERALIZABLE failure patterns (which checks/lines/dimensions failed, and how) — NOT case-specific ground truth; leaking expected answers would be memorization. The held-out gate is the structural backstop: a candidate that overfits the training scenarios cannot clear the paired-bootstrap CI on cases the driver never saw.
15
+
16
+ Generic — any agent benefits by having its judge emit informative `notes`. 3 new tests (notes surfaced + deduped + rendered into the reflection); full suite (1645) green.
17
+
7
18
  ## [0.69.0] — 2026-05-30 — strong generic baseline roles (engineer / researcher / generalist)
8
19
 
9
20
  The structured profile (0.68.0) had a hollow top zone — `baselineProfile` took an arbitrary `role` string. Products are file-producing, tool-using agents living in a sandbox, but nothing gave them a strong operator foundation. This adds three generically-useful, verification-first baseline roles distilled from agent-runtime's `coderProfile` doctrine.
@@ -1,4 +1,4 @@
1
- import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-c2R2kfmv.js';
1
+ import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-CnmZ2bkP.js';
2
2
  import '../run-record-BgTFzO2r.js';
3
3
  import '../errors-Dwqw-T_m.js';
4
4
  import '../schema-m0gsnbt3.js';
@@ -1,4 +1,4 @@
1
- import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-c2R2kfmv.js';
1
+ import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-CnmZ2bkP.js';
2
2
  import '../run-record-BgTFzO2r.js';
3
3
  import '../errors-Dwqw-T_m.js';
4
4
  import '../schema-m0gsnbt3.js';
@@ -1,5 +1,5 @@
1
- import { T as TraceSpanEvent, H as HostedClient } from '../index-DSEHMwvS.js';
2
- import '../types-c2R2kfmv.js';
1
+ import { T as TraceSpanEvent, H as HostedClient } from '../index-BGBrVS24.js';
2
+ import '../types-CnmZ2bkP.js';
3
3
  import '../run-record-BgTFzO2r.js';
4
4
  import '../errors-Dwqw-T_m.js';
5
5
  import '../schema-m0gsnbt3.js';
@@ -1,9 +1,9 @@
1
- import { a as RunCampaignOptions, C as CampaignStorage } from '../run-improvement-loop-BKpM5T4t.js';
2
- export { d as GepaDriverConstraints, G as GepaDriverOptions, O as OpenAutoPrOptions, e as OpenAutoPrResult, b as RunImprovementLoopOptions, R as RunImprovementLoopResult, h as RunOptimizationOptions, j as RunOptimizationResult, k as countSentenceEdits, l as defaultRenderDiff, m as extractH2Sections, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, o as openAutoPr, r as runCampaign, c as runImprovementLoop, n as runOptimization, s as surfaceHash } from '../run-improvement-loop-BKpM5T4t.js';
3
- export { B as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, a as EmitLoopProvenanceArgs, b as EmitLoopProvenanceResult, E as EvolutionaryDriverOptions, H as HeldOutGateOptions, f as LoopProvenanceBackend, g as LoopProvenanceCandidate, L as LoopProvenanceRecord, R as RunEvalOptions, i as buildLoopProvenanceRecord, c as composeGate, d as defaultProductionGate, j as emitLoopProvenance, e as evolutionaryDriver, h as heldOutGate, l as loopProvenanceSpans, p as provenanceRecordPath, k as provenanceSpansPath, r as runEval, s as surfaceContentHash } from '../provenance-CChUqexv.js';
1
+ import { a as RunCampaignOptions, C as CampaignStorage } from '../run-improvement-loop-Bzamo6GB.js';
2
+ export { d as GepaDriverConstraints, G as GepaDriverOptions, O as OpenAutoPrOptions, e as OpenAutoPrResult, b as RunImprovementLoopOptions, R as RunImprovementLoopResult, h as RunOptimizationOptions, j as RunOptimizationResult, k as countSentenceEdits, l as defaultRenderDiff, m as extractH2Sections, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, o as openAutoPr, r as runCampaign, c as runImprovementLoop, n as runOptimization, s as surfaceHash } from '../run-improvement-loop-Bzamo6GB.js';
3
+ export { B as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, a as EmitLoopProvenanceArgs, b as EmitLoopProvenanceResult, E as EvolutionaryDriverOptions, H as HeldOutGateOptions, f as LoopProvenanceBackend, g as LoopProvenanceCandidate, L as LoopProvenanceRecord, R as RunEvalOptions, i as buildLoopProvenanceRecord, c as composeGate, d as defaultProductionGate, j as emitLoopProvenance, e as evolutionaryDriver, h as heldOutGate, l as loopProvenanceSpans, p as provenanceRecordPath, k as provenanceSpansPath, r as runEval, s as surfaceContentHash } from '../provenance-C69gLUXH.js';
4
4
  import { L as LlmClientOptions } from '../llm-client-DbjLfz-K.js';
5
- import { I as ImprovementDriver, J as JudgeScore, L as LabeledScenarioStore, q as LabeledScenarioWrite, r as LabeledScenarioSampleArgs, s as LabeledScenarioRecord, t as LabelTrust, S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, u as LabeledScenarioSource, f as CampaignResult, h as CodeSurface } from '../types-c2R2kfmv.js';
6
- export { C as CampaignAggregates, c as CampaignArtifactWriter, d as CampaignCellResult, e as CampaignCostMeter, v as CampaignTokenUsage, g as CampaignTraceWriter, D as DispatchFn, G as Gate, i as GateContext, j as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, w as JudgeAggregate, n as JudgeDimension, o as Mutator, O as OptimizerConfig, P as ParetoParent, x as ProposeContext, y as ProposedCandidate, R as RedactionStatus, z as ScenarioAggregate, p as SessionScript, T as TraceSpan, A as isProposedCandidate, B as labelTrustRank } from '../types-c2R2kfmv.js';
5
+ import { I as ImprovementDriver, J as JudgeScore, L as LabeledScenarioStore, q as LabeledScenarioWrite, r as LabeledScenarioSampleArgs, s as LabeledScenarioRecord, t as LabelTrust, S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, u as LabeledScenarioSource, f as CampaignResult, h as CodeSurface } from '../types-CnmZ2bkP.js';
6
+ export { C as CampaignAggregates, c as CampaignArtifactWriter, d as CampaignCellResult, e as CampaignCostMeter, v as CampaignTokenUsage, g as CampaignTraceWriter, D as DispatchFn, G as Gate, i as GateContext, j as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, w as JudgeAggregate, n as JudgeDimension, o as Mutator, O as OptimizerConfig, P as ParetoParent, x as ProposeContext, y as ProposedCandidate, R as RedactionStatus, z as ScenarioAggregate, p as SessionScript, T as TraceSpan, A as isProposedCandidate, B as labelTrustRank } from '../types-CnmZ2bkP.js';
7
7
  import { a as PairedBootstrapResult } from '../statistics-B7yCbi9i.js';
8
8
  import { A as AgentProfile, B as BackendIntegrityReport } from '../agent-profile-DzcPHR1Z.js';
9
9
  import { A as AgentEvalError } from '../errors-Dwqw-T_m.js';
@@ -12,7 +12,7 @@ import '../red-team-DW9Ca_tj.js';
12
12
  import '../dataset-B2kL-fSM.js';
13
13
  import '../store-CKUAgsJz.js';
14
14
  import '../schema-m0gsnbt3.js';
15
- import '../index-DSEHMwvS.js';
15
+ import '../index-BGBrVS24.js';
16
16
  import '../summary-report-ByiOUrHj.js';
17
17
  import '../failure-cluster-CL7IVgkJ.js';
18
18
  import '../judge-calibration-DilmB3Ml.js';
@@ -686,10 +686,12 @@ declare function campaignMeanComposite<TArtifact, TScenario extends Scenario>(ca
686
686
  interface CampaignBreakdown {
687
687
  /** Mean score per judge dimension across all cells. */
688
688
  dimensions: Record<string, number>;
689
- /** Per-scenario composite (mean over reps + judges). */
689
+ /** Per-scenario composite (mean over reps + judges), plus that scenario's
690
+ * distinct judge `notes` joined with " | " (the "why" a reflective driver grounds on). */
690
691
  scenarios: Array<{
691
692
  scenarioId: string;
692
693
  composite: number;
694
+ notes?: string;
693
695
  }>;
694
696
  }
695
697
  /** Per-candidate evidence a reflective/patch driver grounds its next proposal
@@ -7,7 +7,7 @@ import {
7
7
  heldoutSignificance,
8
8
  pairHoldout,
9
9
  runEval
10
- } from "../chunk-E24XD7A2.js";
10
+ } from "../chunk-GYELOWB6.js";
11
11
  import {
12
12
  agentProfileHash
13
13
  } from "../chunk-PQV2TKC3.js";
@@ -31,7 +31,7 @@ import {
31
31
  runOptimization,
32
32
  surfaceContentHash,
33
33
  surfaceHash
34
- } from "../chunk-JFGZPUMU.js";
34
+ } from "../chunk-ZZCQQHW7.js";
35
35
  import {
36
36
  assertRealBackend,
37
37
  fsCampaignStorage,
@@ -1,7 +1,7 @@
1
1
  import {
2
2
  runCanaries,
3
3
  scoreRedTeamOutput
4
- } from "./chunk-JFGZPUMU.js";
4
+ } from "./chunk-ZZCQQHW7.js";
5
5
  import {
6
6
  runCampaign
7
7
  } from "./chunk-6XQIEUQ2.js";
@@ -315,4 +315,4 @@ export {
315
315
  defaultProductionGate,
316
316
  runEval
317
317
  };
318
- //# sourceMappingURL=chunk-E24XD7A2.js.map
318
+ //# sourceMappingURL=chunk-GYELOWB6.js.map
@@ -671,6 +671,10 @@ function buildReflectionPrompt(ctx) {
671
671
  sections.push(
672
672
  `### Trial \`${trial.id}\` \u2014 score ${trial.score.toFixed(2)}${trial.inputName ? ` (${trial.inputName})` : ""}`
673
673
  );
674
+ if (trial.failureNote) {
675
+ sections.push("");
676
+ sections.push(`**Why it scored low:** ${truncate(trial.failureNote, 600)}`);
677
+ }
674
678
  const missed = (trial.expectations ?? []).filter((e) => !e.matched);
675
679
  if (missed.length > 0) {
676
680
  sections.push("");
@@ -986,7 +990,10 @@ function buildEvidence(ctx, evidenceK, baseTarget) {
986
990
  const byScore = [...best.scenarios].sort((a, b) => b.composite - a.composite);
987
991
  const toTrace = (s) => ({
988
992
  id: s.scenarioId,
989
- score: s.composite
993
+ score: s.composite,
994
+ // The judge's "why it scored low" — grounds the reflection on real failure
995
+ // patterns instead of blind rephrasing. Generalizable by the judge contract.
996
+ ...s.notes ? { failureNote: s.notes } : {}
990
997
  });
991
998
  const top = byScore.slice(0, evidenceK).map(toTrace);
992
999
  const bottom = byScore.slice(-evidenceK).reverse().map(toTrace);
@@ -1156,6 +1163,7 @@ function campaignBreakdown(campaign) {
1156
1163
  const dimSums = {};
1157
1164
  const dimCounts = {};
1158
1165
  const byScenario = /* @__PURE__ */ new Map();
1166
+ const notesByScenario = /* @__PURE__ */ new Map();
1159
1167
  for (const cell of campaign.cells) {
1160
1168
  const judgeScores = Object.values(cell.judgeScores);
1161
1169
  if (judgeScores.length === 0) continue;
@@ -1163,6 +1171,13 @@ function campaignBreakdown(campaign) {
1163
1171
  const arr = byScenario.get(cell.scenarioId) ?? [];
1164
1172
  arr.push(cellComposite);
1165
1173
  byScenario.set(cell.scenarioId, arr);
1174
+ for (const s of judgeScores) {
1175
+ if (s.notes && s.notes.trim()) {
1176
+ const set = notesByScenario.get(cell.scenarioId) ?? /* @__PURE__ */ new Set();
1177
+ set.add(s.notes.trim());
1178
+ notesByScenario.set(cell.scenarioId, set);
1179
+ }
1180
+ }
1166
1181
  for (const score of judgeScores) {
1167
1182
  for (const [key, value] of Object.entries(score.dimensions)) {
1168
1183
  dimSums[key] = (dimSums[key] ?? 0) + value;
@@ -1175,10 +1190,15 @@ function campaignBreakdown(campaign) {
1175
1190
  const count = dimCounts[key] ?? 0;
1176
1191
  dimensions[key] = count > 0 ? (dimSums[key] ?? 0) / count : 0;
1177
1192
  }
1178
- const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => ({
1179
- scenarioId,
1180
- composite: comps.reduce((a, b) => a + b, 0) / comps.length
1181
- }));
1193
+ const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => {
1194
+ const notesSet = notesByScenario.get(scenarioId);
1195
+ const notes = notesSet && notesSet.size > 0 ? [...notesSet].join(" | ") : void 0;
1196
+ return {
1197
+ scenarioId,
1198
+ composite: comps.reduce((a, b) => a + b, 0) / comps.length,
1199
+ ...notes ? { notes } : {}
1200
+ };
1201
+ });
1182
1202
  return { dimensions, scenarios };
1183
1203
  }
1184
1204
 
@@ -1764,4 +1784,4 @@ export {
1764
1784
  provenanceSpansPath,
1765
1785
  emitLoopProvenance
1766
1786
  };
1767
- //# sourceMappingURL=chunk-JFGZPUMU.js.map
1787
+ //# sourceMappingURL=chunk-ZZCQQHW7.js.map