@tangle-network/agent-eval 0.69.0 → 0.70.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +11 -0
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +2 -2
- package/dist/campaign/index.d.ts +9 -7
- package/dist/campaign/index.js +2 -2
- package/dist/{chunk-E24XD7A2.js → chunk-GYELOWB6.js} +2 -2
- package/dist/{chunk-JFGZPUMU.js → chunk-ZZCQQHW7.js} +26 -6
- package/dist/chunk-ZZCQQHW7.js.map +1 -0
- package/dist/contract/index.d.ts +8 -8
- package/dist/contract/index.js +2 -2
- package/dist/hosted/index.d.ts +2 -2
- package/dist/{index-DSEHMwvS.d.ts → index-BGBrVS24.d.ts} +1 -1
- package/dist/index.d.ts +7 -2
- package/dist/index.js +1 -1
- package/dist/openapi.json +1 -1
- package/dist/{provenance-CChUqexv.d.ts → provenance-C69gLUXH.d.ts} +3 -3
- package/dist/rl.d.ts +1 -1
- package/dist/{run-improvement-loop-BKpM5T4t.d.ts → run-improvement-loop-Bzamo6GB.d.ts} +1 -1
- package/dist/{types-c2R2kfmv.d.ts → types-CnmZ2bkP.d.ts} +7 -1
- package/package.json +1 -1
- package/dist/chunk-JFGZPUMU.js.map +0 -1
- /package/dist/{chunk-E24XD7A2.js.map → chunk-GYELOWB6.js.map} +0 -0
package/CHANGELOG.md
CHANGED
|
@@ -4,6 +4,17 @@ All notable changes to `@tangle-network/agent-eval` and its sibling `agent-eval-
|
|
|
4
4
|
|
|
5
5
|
---
|
|
6
6
|
|
|
7
|
+
## [0.70.0] — 2026-05-31 — error-grounded reflection (the driver targets real failures, not blind rewrites)
|
|
8
|
+
|
|
9
|
+
Adversarial verification on TWO domains (legal + tax, two worker models) found the same root cause: the `gepaDriver`'s candidates **regressed** against the baseline, so the gate correctly held — but nothing improved. The driver was reflecting on per-scenario *scores* only; the judge's `notes` (the "why it failed") were computed but **dropped** before the reflection step. As a result, it proposed generic rewrites that a capable model already knows, and such rewrites distract rather than help.
|
|
10
|
+
|
|
11
|
+
### Fixed
|
|
12
|
+
|
|
13
|
+
- **Judge `notes` now reach the reflective driver.** `campaignBreakdown` collects each scenario's judge `notes` (deduped) into `scenarios[].notes`; `GenerationCandidate.scenarios` + `CampaignBreakdown.scenarios` carry it; `gepaDriver`'s `buildEvidence` surfaces it as `TrialTrace.failureNote`; `buildReflectionPrompt` renders a **"Why it scored low"** block per bottom trial. The optimizer now grounds its next edit on the actual failure pattern.
|
|
14
|
+
- **Anti-overfit by contract + by construction.** The `notes` are documented as GENERALIZABLE failure patterns (which checks/lines/dimensions failed, and how) — NOT case-specific ground truth; leaking expected answers would be memorization. And the held-out gate is the structural backstop: a candidate that overfits train cannot clear the paired-bootstrap CI on cases the driver never saw.
|
|
15
|
+
|
|
16
|
+
Generic — any agent benefits by having its judge emit informative `notes`. 3 new tests (notes surfaced + deduped + rendered into the reflection); full suite (1645) green.
|
|
17
|
+
|
|
7
18
|
## [0.69.0] — 2026-05-30 — strong generic baseline roles (engineer / researcher / generalist)
|
|
8
19
|
|
|
9
20
|
The structured profile (0.68.0) had a hollow top zone — `baselineProfile` took an arbitrary `role` string. Products are file-producing, tool-using agents living in a sandbox, but nothing gave them a strong operator foundation. This adds three generically-useful, verification-first baseline roles distilled from agent-runtime's `coderProfile` doctrine.
|
package/dist/adapters/http.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-c2R2kfmv.js';
|
|
1
|
+
import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-CnmZ2bkP.js';
|
|
2
2
|
import '../run-record-BgTFzO2r.js';
|
|
3
3
|
import '../errors-Dwqw-T_m.js';
|
|
4
4
|
import '../schema-m0gsnbt3.js';
|
|
package/dist/adapters/langchain.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-c2R2kfmv.js';
|
|
1
|
+
import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-CnmZ2bkP.js';
|
|
2
2
|
import '../run-record-BgTFzO2r.js';
|
|
3
3
|
import '../errors-Dwqw-T_m.js';
|
|
4
4
|
import '../schema-m0gsnbt3.js';
|
package/dist/adapters/otel.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { T as TraceSpanEvent, H as HostedClient } from '../index-DSEHMwvS.js';
|
|
2
|
-
import '../types-c2R2kfmv.js';
|
|
1
|
+
import { T as TraceSpanEvent, H as HostedClient } from '../index-BGBrVS24.js';
|
|
2
|
+
import '../types-CnmZ2bkP.js';
|
|
3
3
|
import '../run-record-BgTFzO2r.js';
|
|
4
4
|
import '../errors-Dwqw-T_m.js';
|
|
5
5
|
import '../schema-m0gsnbt3.js';
|
package/dist/campaign/index.d.ts
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { a as RunCampaignOptions, C as CampaignStorage } from '../run-improvement-loop-BKpM5T4t.js';
|
|
2
|
-
export { d as GepaDriverConstraints, G as GepaDriverOptions, O as OpenAutoPrOptions, e as OpenAutoPrResult, b as RunImprovementLoopOptions, R as RunImprovementLoopResult, h as RunOptimizationOptions, j as RunOptimizationResult, k as countSentenceEdits, l as defaultRenderDiff, m as extractH2Sections, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, o as openAutoPr, r as runCampaign, c as runImprovementLoop, n as runOptimization, s as surfaceHash } from '../run-improvement-loop-BKpM5T4t.js';
|
|
3
|
-
export { B as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, a as EmitLoopProvenanceArgs, b as EmitLoopProvenanceResult, E as EvolutionaryDriverOptions, H as HeldOutGateOptions, f as LoopProvenanceBackend, g as LoopProvenanceCandidate, L as LoopProvenanceRecord, R as RunEvalOptions, i as buildLoopProvenanceRecord, c as composeGate, d as defaultProductionGate, j as emitLoopProvenance, e as evolutionaryDriver, h as heldOutGate, l as loopProvenanceSpans, p as provenanceRecordPath, k as provenanceSpansPath, r as runEval, s as surfaceContentHash } from '../provenance-CChUqexv.js';
|
|
1
|
+
import { a as RunCampaignOptions, C as CampaignStorage } from '../run-improvement-loop-Bzamo6GB.js';
|
|
2
|
+
export { d as GepaDriverConstraints, G as GepaDriverOptions, O as OpenAutoPrOptions, e as OpenAutoPrResult, b as RunImprovementLoopOptions, R as RunImprovementLoopResult, h as RunOptimizationOptions, j as RunOptimizationResult, k as countSentenceEdits, l as defaultRenderDiff, m as extractH2Sections, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, o as openAutoPr, r as runCampaign, c as runImprovementLoop, n as runOptimization, s as surfaceHash } from '../run-improvement-loop-Bzamo6GB.js';
|
|
3
|
+
export { B as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, a as EmitLoopProvenanceArgs, b as EmitLoopProvenanceResult, E as EvolutionaryDriverOptions, H as HeldOutGateOptions, f as LoopProvenanceBackend, g as LoopProvenanceCandidate, L as LoopProvenanceRecord, R as RunEvalOptions, i as buildLoopProvenanceRecord, c as composeGate, d as defaultProductionGate, j as emitLoopProvenance, e as evolutionaryDriver, h as heldOutGate, l as loopProvenanceSpans, p as provenanceRecordPath, k as provenanceSpansPath, r as runEval, s as surfaceContentHash } from '../provenance-C69gLUXH.js';
|
|
4
4
|
import { L as LlmClientOptions } from '../llm-client-DbjLfz-K.js';
|
|
5
|
-
import { I as ImprovementDriver, J as JudgeScore, L as LabeledScenarioStore, q as LabeledScenarioWrite, r as LabeledScenarioSampleArgs, s as LabeledScenarioRecord, t as LabelTrust, S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, u as LabeledScenarioSource, f as CampaignResult, h as CodeSurface } from '../types-c2R2kfmv.js';
|
|
6
|
-
export { C as CampaignAggregates, c as CampaignArtifactWriter, d as CampaignCellResult, e as CampaignCostMeter, v as CampaignTokenUsage, g as CampaignTraceWriter, D as DispatchFn, G as Gate, i as GateContext, j as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, w as JudgeAggregate, n as JudgeDimension, o as Mutator, O as OptimizerConfig, P as ParetoParent, x as ProposeContext, y as ProposedCandidate, R as RedactionStatus, z as ScenarioAggregate, p as SessionScript, T as TraceSpan, A as isProposedCandidate, B as labelTrustRank } from '../types-c2R2kfmv.js';
|
|
5
|
+
import { I as ImprovementDriver, J as JudgeScore, L as LabeledScenarioStore, q as LabeledScenarioWrite, r as LabeledScenarioSampleArgs, s as LabeledScenarioRecord, t as LabelTrust, S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, u as LabeledScenarioSource, f as CampaignResult, h as CodeSurface } from '../types-CnmZ2bkP.js';
|
|
6
|
+
export { C as CampaignAggregates, c as CampaignArtifactWriter, d as CampaignCellResult, e as CampaignCostMeter, v as CampaignTokenUsage, g as CampaignTraceWriter, D as DispatchFn, G as Gate, i as GateContext, j as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, w as JudgeAggregate, n as JudgeDimension, o as Mutator, O as OptimizerConfig, P as ParetoParent, x as ProposeContext, y as ProposedCandidate, R as RedactionStatus, z as ScenarioAggregate, p as SessionScript, T as TraceSpan, A as isProposedCandidate, B as labelTrustRank } from '../types-CnmZ2bkP.js';
|
|
7
7
|
import { a as PairedBootstrapResult } from '../statistics-B7yCbi9i.js';
|
|
8
8
|
import { A as AgentProfile, B as BackendIntegrityReport } from '../agent-profile-DzcPHR1Z.js';
|
|
9
9
|
import { A as AgentEvalError } from '../errors-Dwqw-T_m.js';
|
|
@@ -12,7 +12,7 @@ import '../red-team-DW9Ca_tj.js';
|
|
|
12
12
|
import '../dataset-B2kL-fSM.js';
|
|
13
13
|
import '../store-CKUAgsJz.js';
|
|
14
14
|
import '../schema-m0gsnbt3.js';
|
|
15
|
-
import '../index-DSEHMwvS.js';
|
|
15
|
+
import '../index-BGBrVS24.js';
|
|
16
16
|
import '../summary-report-ByiOUrHj.js';
|
|
17
17
|
import '../failure-cluster-CL7IVgkJ.js';
|
|
18
18
|
import '../judge-calibration-DilmB3Ml.js';
|
|
@@ -686,10 +686,12 @@ declare function campaignMeanComposite<TArtifact, TScenario extends Scenario>(ca
|
|
|
686
686
|
interface CampaignBreakdown {
|
|
687
687
|
/** Mean score per judge dimension across all cells. */
|
|
688
688
|
dimensions: Record<string, number>;
|
|
689
|
-
/** Per-scenario composite (mean over reps + judges). */
|
|
689
|
+
/** Per-scenario composite (mean over reps + judges) + the judge's free-form
|
|
690
|
+
* `notes` for that scenario (the "why" a reflective driver grounds on). */
|
|
690
691
|
scenarios: Array<{
|
|
691
692
|
scenarioId: string;
|
|
692
693
|
composite: number;
|
|
694
|
+
notes?: string;
|
|
693
695
|
}>;
|
|
694
696
|
}
|
|
695
697
|
/** Per-candidate evidence a reflective/patch driver grounds its next proposal
|
package/dist/campaign/index.js
CHANGED
|
@@ -7,7 +7,7 @@ import {
|
|
|
7
7
|
heldoutSignificance,
|
|
8
8
|
pairHoldout,
|
|
9
9
|
runEval
|
|
10
|
-
} from "../chunk-E24XD7A2.js";
|
|
10
|
+
} from "../chunk-GYELOWB6.js";
|
|
11
11
|
import {
|
|
12
12
|
agentProfileHash
|
|
13
13
|
} from "../chunk-PQV2TKC3.js";
|
|
@@ -31,7 +31,7 @@ import {
|
|
|
31
31
|
runOptimization,
|
|
32
32
|
surfaceContentHash,
|
|
33
33
|
surfaceHash
|
|
34
|
-
} from "../chunk-JFGZPUMU.js";
|
|
34
|
+
} from "../chunk-ZZCQQHW7.js";
|
|
35
35
|
import {
|
|
36
36
|
assertRealBackend,
|
|
37
37
|
fsCampaignStorage,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import {
|
|
2
2
|
runCanaries,
|
|
3
3
|
scoreRedTeamOutput
|
|
4
|
-
} from "./chunk-JFGZPUMU.js";
|
|
4
|
+
} from "./chunk-ZZCQQHW7.js";
|
|
5
5
|
import {
|
|
6
6
|
runCampaign
|
|
7
7
|
} from "./chunk-6XQIEUQ2.js";
|
|
@@ -315,4 +315,4 @@ export {
|
|
|
315
315
|
defaultProductionGate,
|
|
316
316
|
runEval
|
|
317
317
|
};
|
|
318
|
-
//# sourceMappingURL=chunk-E24XD7A2.js.map
|
|
318
|
+
//# sourceMappingURL=chunk-GYELOWB6.js.map
|
|
@@ -671,6 +671,10 @@ function buildReflectionPrompt(ctx) {
|
|
|
671
671
|
sections.push(
|
|
672
672
|
`### Trial \`${trial.id}\` \u2014 score ${trial.score.toFixed(2)}${trial.inputName ? ` (${trial.inputName})` : ""}`
|
|
673
673
|
);
|
|
674
|
+
if (trial.failureNote) {
|
|
675
|
+
sections.push("");
|
|
676
|
+
sections.push(`**Why it scored low:** ${truncate(trial.failureNote, 600)}`);
|
|
677
|
+
}
|
|
674
678
|
const missed = (trial.expectations ?? []).filter((e) => !e.matched);
|
|
675
679
|
if (missed.length > 0) {
|
|
676
680
|
sections.push("");
|
|
@@ -986,7 +990,10 @@ function buildEvidence(ctx, evidenceK, baseTarget) {
|
|
|
986
990
|
const byScore = [...best.scenarios].sort((a, b) => b.composite - a.composite);
|
|
987
991
|
const toTrace = (s) => ({
|
|
988
992
|
id: s.scenarioId,
|
|
989
|
-
score: s.composite
|
|
993
|
+
score: s.composite,
|
|
994
|
+
// The judge's "why it scored low" — grounds the reflection on real failure
|
|
995
|
+
// patterns instead of blind rephrasing. Generalizable by the judge contract.
|
|
996
|
+
...s.notes ? { failureNote: s.notes } : {}
|
|
990
997
|
});
|
|
991
998
|
const top = byScore.slice(0, evidenceK).map(toTrace);
|
|
992
999
|
const bottom = byScore.slice(-evidenceK).reverse().map(toTrace);
|
|
@@ -1156,6 +1163,7 @@ function campaignBreakdown(campaign) {
|
|
|
1156
1163
|
const dimSums = {};
|
|
1157
1164
|
const dimCounts = {};
|
|
1158
1165
|
const byScenario = /* @__PURE__ */ new Map();
|
|
1166
|
+
const notesByScenario = /* @__PURE__ */ new Map();
|
|
1159
1167
|
for (const cell of campaign.cells) {
|
|
1160
1168
|
const judgeScores = Object.values(cell.judgeScores);
|
|
1161
1169
|
if (judgeScores.length === 0) continue;
|
|
@@ -1163,6 +1171,13 @@ function campaignBreakdown(campaign) {
|
|
|
1163
1171
|
const arr = byScenario.get(cell.scenarioId) ?? [];
|
|
1164
1172
|
arr.push(cellComposite);
|
|
1165
1173
|
byScenario.set(cell.scenarioId, arr);
|
|
1174
|
+
for (const s of judgeScores) {
|
|
1175
|
+
if (s.notes && s.notes.trim()) {
|
|
1176
|
+
const set = notesByScenario.get(cell.scenarioId) ?? /* @__PURE__ */ new Set();
|
|
1177
|
+
set.add(s.notes.trim());
|
|
1178
|
+
notesByScenario.set(cell.scenarioId, set);
|
|
1179
|
+
}
|
|
1180
|
+
}
|
|
1166
1181
|
for (const score of judgeScores) {
|
|
1167
1182
|
for (const [key, value] of Object.entries(score.dimensions)) {
|
|
1168
1183
|
dimSums[key] = (dimSums[key] ?? 0) + value;
|
|
@@ -1175,10 +1190,15 @@ function campaignBreakdown(campaign) {
|
|
|
1175
1190
|
const count = dimCounts[key] ?? 0;
|
|
1176
1191
|
dimensions[key] = count > 0 ? (dimSums[key] ?? 0) / count : 0;
|
|
1177
1192
|
}
|
|
1178
|
-
const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => ({
|
|
1179
|
-
scenarioId,
|
|
1180
|
-
composite: comps.reduce((a, b) => a + b, 0) / comps.length
|
|
1181
|
-
}));
|
|
1193
|
+
const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => {
|
|
1194
|
+
const notesSet = notesByScenario.get(scenarioId);
|
|
1195
|
+
const notes = notesSet && notesSet.size > 0 ? [...notesSet].join(" | ") : void 0;
|
|
1196
|
+
return {
|
|
1197
|
+
scenarioId,
|
|
1198
|
+
composite: comps.reduce((a, b) => a + b, 0) / comps.length,
|
|
1199
|
+
...notes ? { notes } : {}
|
|
1200
|
+
};
|
|
1201
|
+
});
|
|
1182
1202
|
return { dimensions, scenarios };
|
|
1183
1203
|
}
|
|
1184
1204
|
|
|
@@ -1764,4 +1784,4 @@ export {
|
|
|
1764
1784
|
provenanceSpansPath,
|
|
1765
1785
|
emitLoopProvenance
|
|
1766
1786
|
};
|
|
1767
|
-
//# sourceMappingURL=chunk-JFGZPUMU.js.map
|
|
1787
|
+
//# sourceMappingURL=chunk-ZZCQQHW7.js.map
|