npm - @tangle-network/agent-eval - Versions diffs - 0.66.0 → 0.68.0 - Mend

@tangle-network/agent-eval 0.66.0 → 0.68.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

package/CHANGELOG.md +28 -0
package/dist/campaign/index.d.ts +107 -4
package/dist/campaign/index.js +17 -9
package/dist/campaign/index.js.map +1 -1
package/dist/chunk-E24XD7A2.js +318 -0
package/dist/chunk-E24XD7A2.js.map +1 -0
package/dist/{chunk-Q56RRLEC.js → chunk-JFGZPUMU.js} +289 -5
package/dist/chunk-JFGZPUMU.js.map +1 -0
package/dist/contract/index.d.ts +4 -4
package/dist/contract/index.js +6 -6
package/dist/index.d.ts +120 -11
package/dist/index.js +100 -2
package/dist/index.js.map +1 -1
package/dist/openapi.json +1 -1
package/dist/{provenance-BZUFC1_D.d.ts → provenance-CChUqexv.d.ts} +23 -1
package/dist/{registry-BzAEvqAt.d.ts → registry-BGKyX6bw.d.ts} +1 -1
package/dist/release-report-CN8hJlhk.d.ts +233 -0
package/dist/reporting.d.ts +4 -3
package/dist/statistics-B7yCbi9i.d.ts +253 -0
package/dist/{types-DhqpAi_z.d.ts → types-Croy5h7V.d.ts} +1 -1
package/package.json +1 -1
package/dist/chunk-Q56RRLEC.js.map +0 -1
package/dist/chunk-RDK3P4JE.js +0 -482
package/dist/chunk-RDK3P4JE.js.map +0 -1
package/dist/release-report-DGoeObZT.d.ts +0 -484

package/CHANGELOG.md CHANGED Viewed

@@ -4,6 +4,34 @@ All notable changes to `@tangle-network/agent-eval` and its sibling `agent-eval-
 ---
+## [0.68.0] — 2026-05-30 — structured AgentProfile (the self-improvement surface stops being an opaque blob)
+The optimizable surface was an opaque string addendum, so the loop could only mutate (and the dashboard only diff) an unstructured blob — you couldn't see *what kind* of improvement a candidate made. This adds a **sectioned `AgentProfile`** primitive (mirrored on Harvey LAB's system-prompt structure) so the surface has named, separately-addressable zones the loop targets one at a time.
+### Added
+- **`profile` namespace** (`import { profile } from '@tangle-network/agent-eval'`):
+  - `AgentProfile { role, environment, toolConventions, skills: ProfileSkill[], domain: AgentProfileSection[] }` — the structured surface. `environment` is a first-class section (the sandbox contract: workspace root, read-only documents, output dir, skills dir), matching how an agentic harness actually addresses its sandbox.
+  - `renderProfile(p)` emits the system prompt in fixed order: role → `## Environment` → `## Tool conventions` → `## Skills` → `## Domain guidance`.
+  - `baselineProfile` / `prodProfile(baseline, shipped)` — baseline = empty domain + stock skills; prod = baseline + gate-certified domain sections.
+  - `applyDomainPatch(p, sectionId, body)` — **section-scoped** edit so the improvement loop optimizes ONE evolvable section, not the whole blob; `profileToSurface(p)` bridges to the existing string `MutableSurface`.
+- Namespaced as `profile.*` to avoid clashing with the benchmark-cell `AgentProfile` already exported from `./agent-profile`.
+Additive — does not touch `runImprovementLoop` or the string surface. 15 tests (zone order; only evolvable sections change hash under `applyDomainPatch`; baseline vs prod differ only in domain/skills; Environment present + non-empty). Full suite (1639) green. First consumers: the TaxCalcBench + Harvey LAB benchmark adapters (tax-agent / legal-agent) that score our agent's profile against public leaderboards.
+## [0.67.0] — 2026-05-30 — the promotion gate is statistically trustworthy (no more shipping noise)
+An adversarial review of a real "ship +4.0 lift" decision found it was a **triple false positive**: the driver's candidate lost on train, so the winner was the baseline (empty diff); the loop re-scored the baseline against ITSELF on the holdout and read run-to-run model noise (91 vs 95) as a "+4 lift"; and a point-estimate gate (`delta >= 0.03` on a 0-100 scale, `reps:1`) shipped it — while the reward-hacking gate was blind to a −30 regression on a safety dimension hiding under the +4 net. The promotion gate could not tell a real improvement from noise or from a Goodhart trade.
+### Fixed / Added
+- **No-op guard** (`runImprovementLoop`) — when the winner is byte-identical to the baseline (no candidate beat the training baseline, empty diff), the loop now forces `hold` and skips the meaningless baseline-vs-itself holdout pass, instead of shipping the noise delta.
+- **Statistical held-out gate** — `defaultProductionGate`'s held-out check is now a **paired bootstrap CI**, not a point estimate. It pairs candidate vs baseline holdout cells by **full `cellId` (`scenario:rep`)** — never averaging reps away — and ships only when the CI lower bound clears `deltaThreshold` (default 0 ⇒ confidently positive). Below `minProductiveRuns` (default 3) paired observations it HOLDS with `few_runs` rather than reading a degenerate interval. (New module `src/campaign/gates/statistical-heldout.ts`; reuses `pairedBootstrap` from `src/statistics.ts`.)
+- **Per-dimension regression guard (anti-Goodhart)** — `criticalDimensions` + `regressionTolerance` on `DefaultProductionGateOptions`. The gate HOLDS if any guarded dimension's paired-delta CI lower bound falls below −tolerance, even when the net composite rose. Tolerance auto-scales (0.05 on [0,1], 5 on 0-100) so a default expressed for one scale isn't a silent no-op on the other.
+- **Exports** `pairHoldout`, `heldoutSignificance`, `dimensionRegressions`, `detectScale` from `/campaign`.
+This collapses the duplicated gate tech-debt (a rigorous `src/held-out-gate.ts` existed but the loop wired the weak adapter) onto the shared `pairedBootstrap` statistics. 12 new regression tests, including the exact noisy-same-mean false positive and the composite-up/dimension-down Goodhart trade. Full suite (1624) green. The remaining path to a *proven* self-improvement (headroom corpus + Goodhart-resistant measurement, driver effectiveness, inter-cycle compounding) is tracked separately.
 ## [0.66.0] — 2026-05-30 — the improvement loop can no longer hang silently or ingest to the wrong URL
 ### Fixed

package/dist/campaign/index.d.ts CHANGED Viewed

@@ -1,9 +1,10 @@
 import { a as RunCampaignOptions, C as CampaignStorage } from '../run-improvement-loop-BKpM5T4t.js';
 export { d as GepaDriverConstraints, G as GepaDriverOptions, O as OpenAutoPrOptions, e as OpenAutoPrResult, b as RunImprovementLoopOptions, R as RunImprovementLoopResult, h as RunOptimizationOptions, j as RunOptimizationResult, k as countSentenceEdits, l as defaultRenderDiff, m as extractH2Sections, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, o as openAutoPr, r as runCampaign, c as runImprovementLoop, n as runOptimization, s as surfaceHash } from '../run-improvement-loop-BKpM5T4t.js';
-export { B as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, a as EmitLoopProvenanceArgs, b as EmitLoopProvenanceResult, E as EvolutionaryDriverOptions, H as HeldOutGateOptions, f as LoopProvenanceBackend, g as LoopProvenanceCandidate, L as LoopProvenanceRecord, R as RunEvalOptions, i as buildLoopProvenanceRecord, c as composeGate, d as defaultProductionGate, j as emitLoopProvenance, e as evolutionaryDriver, h as heldOutGate, l as loopProvenanceSpans, p as provenanceRecordPath, k as provenanceSpansPath, r as runEval, s as surfaceContentHash } from '../provenance-BZUFC1_D.js';
+export { B as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, a as EmitLoopProvenanceArgs, b as EmitLoopProvenanceResult, E as EvolutionaryDriverOptions, H as HeldOutGateOptions, f as LoopProvenanceBackend, g as LoopProvenanceCandidate, L as LoopProvenanceRecord, R as RunEvalOptions, i as buildLoopProvenanceRecord, c as composeGate, d as defaultProductionGate, j as emitLoopProvenance, e as evolutionaryDriver, h as heldOutGate, l as loopProvenanceSpans, p as provenanceRecordPath, k as provenanceSpansPath, r as runEval, s as surfaceContentHash } from '../provenance-CChUqexv.js';
 import { L as LlmClientOptions } from '../llm-client-DbjLfz-K.js';
-import { I as ImprovementDriver, L as LabeledScenarioStore, q as LabeledScenarioWrite, r as LabeledScenarioSampleArgs, s as LabeledScenarioRecord, t as LabelTrust, S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, u as LabeledScenarioSource, f as CampaignResult, h as CodeSurface } from '../types-c2R2kfmv.js';
-export { C as CampaignAggregates, c as CampaignArtifactWriter, d as CampaignCellResult, e as CampaignCostMeter, v as CampaignTokenUsage, g as CampaignTraceWriter, D as DispatchFn, G as Gate, i as GateContext, j as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, w as JudgeAggregate, n as JudgeDimension, J as JudgeScore, o as Mutator, O as OptimizerConfig, P as ParetoParent, x as ProposeContext, y as ProposedCandidate, R as RedactionStatus, z as ScenarioAggregate, p as SessionScript, T as TraceSpan, A as isProposedCandidate, B as labelTrustRank } from '../types-c2R2kfmv.js';
+import { I as ImprovementDriver, J as JudgeScore, L as LabeledScenarioStore, q as LabeledScenarioWrite, r as LabeledScenarioSampleArgs, s as LabeledScenarioRecord, t as LabelTrust, S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, u as LabeledScenarioSource, f as CampaignResult, h as CodeSurface } from '../types-c2R2kfmv.js';
+export { C as CampaignAggregates, c as CampaignArtifactWriter, d as CampaignCellResult, e as CampaignCostMeter, v as CampaignTokenUsage, g as CampaignTraceWriter, D as DispatchFn, G as Gate, i as GateContext, j as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, w as JudgeAggregate, n as JudgeDimension, o as Mutator, O as OptimizerConfig, P as ParetoParent, x as ProposeContext, y as ProposedCandidate, R as RedactionStatus, z as ScenarioAggregate, p as SessionScript, T as TraceSpan, A as isProposedCandidate, B as labelTrustRank } from '../types-c2R2kfmv.js';
+import { a as PairedBootstrapResult } from '../statistics-B7yCbi9i.js';
 import { A as AgentProfile, B as BackendIntegrityReport } from '../agent-profile-DzcPHR1Z.js';
 import { A as AgentEvalError } from '../errors-Dwqw-T_m.js';
 import { b as RunSplitTag, R as RunRecord } from '../run-record-BgTFzO2r.js';
@@ -16,6 +17,8 @@ import '../summary-report-ByiOUrHj.js';
 import '../failure-cluster-CL7IVgkJ.js';
 import '../judge-calibration-DilmB3Ml.js';
 import '../raw-provider-sink-C46HDghv.js';
+import '../types-Croy5h7V.js';
+import '@tangle-network/tcloud';
 /**
  * @experimental
@@ -164,6 +167,106 @@ declare class SkillPatchParseError extends Error {
 }
 declare function parseSkillPatchResponse(raw: string, maxPatches: number, editBudget: number): SkillPatch[];
+/**
+ * @experimental
+ *
+ * Statistical held-out promotion machinery — the trustworthy core the
+ * point-estimate `heldout-delta` gate lacked.
+ *
+ * The shipped false positive it prevents: a winner re-scored against the
+ * baseline on the holdout read run-to-run model NOISE (e.g. 91 vs 95) as a
+ * "+4 lift" and shipped, because the gate compared point estimates with no
+ * confidence interval. Here we pair candidate vs baseline holdout observations
+ * and bootstrap a CI on the paired delta — a candidate ships only when the CI
+ * lower bound clears the effect-size threshold (the gain is real at the
+ * confidence level, not noise), and is blocked when a critical dimension
+ * (e.g. `hallucination_free` for a legal agent) significantly regresses even if
+ * the net composite rose (anti-Goodhart).
+ *
+ * Two traps this module is built around (both produce a NEW false positive if
+ * gotten wrong):
+ *   1. PAIRING GRANULARITY — pairs by FULL `cellId` (`scenario:rep`), never by
+ *      `scenarioId` (which averages reps away and destroys the within-pair
+ *      variance reduction that makes a paired bootstrap tighter than unpaired).
+ *      One paired observation per cell ⇒ reps multiply n.
+ *   2. SCALE — a judge may emit composites/dimensions on [0,1] or 0-100. The
+ *      threshold + tolerance are interpreted in the judge's NATIVE scale; the
+ *      per-dimension tolerance auto-scales off the observed baseline magnitudes
+ *      so `-0.10` on [0,1] doesn't silently become a no-op on a 0-100 dimension.
+ */
+interface PairedHoldout {
+    /** Baseline scalar per paired cell (same order as `after`/`cellIds`). */
+    before: number[];
+    /** Candidate scalar per paired cell. */
+    after: number[];
+    /** The full cellIds (`scenario:rep`) that paired, in order. */
+    cellIds: string[];
+}
+/**
+ * Pair candidate vs baseline holdout observations by FULL cellId. `select`
+ * pulls the scalar from a cell's judge reports (composite, or a named
+ * dimension); a cell contributes the mean of `select` across its judges. Cells
+ * whose scenario is not in `scenarioIds`, or where `select` is undefined for
+ * every judge on either side, are skipped on BOTH sides so the arrays stay
+ * paired. Throws when the two maps disagree on which holdout cells exist — a
+ * load-bearing invariant: the baseline + winner holdout campaigns run the same
+ * scenarios with the same seed base, so their cellIds MUST align; a mismatch
+ * means a silent pairing bug, not a soft fallback.
+ */
+declare function pairHoldout(candidate: Map<string, Record<string, JudgeScore>>, baseline: Map<string, Record<string, JudgeScore>>, scenarioIds: Set<string>, select: (s: JudgeScore) => number | undefined): PairedHoldout;
+interface HeldoutSignificance {
+    paired: PairedHoldout;
+    bootstrap: PairedBootstrapResult;
+    /** n paired observations. */
+    n: number;
+    /** True iff n >= minProductiveRuns AND the CI lower bound clears the threshold. */
+    significant: boolean;
+    /** Set when n < minProductiveRuns — too little evidence to claim significance. */
+    fewRuns: boolean;
+}
+interface HeldoutSignificanceOptions {
+    deltaThreshold?: number;
+    minProductiveRuns?: number;
+    confidence?: number;
+    resamples?: number;
+    /** Fixed by default for a deterministic, reproducible gate verdict. */
+    seed?: number;
+    statistic?: 'mean' | 'median';
+}
+/** Significance of the held-out composite lift: ship only when the paired
+ *  bootstrap CI lower bound on (candidate − baseline) exceeds `deltaThreshold`
+ *  (default 0 ⇒ "confidently positive"). Below `minProductiveRuns` paired
+ *  observations there is not enough evidence to claim significance → not
+ *  significant (`fewRuns`). Interpret `deltaThreshold` in the judge's native
+ *  composite scale. */
+declare function heldoutSignificance(paired: PairedHoldout, opts?: HeldoutSignificanceOptions): HeldoutSignificance;
+interface DimensionRegression {
+    dimension: string;
+    bootstrap: PairedBootstrapResult;
+    /** True iff the CI lower bound on (candidate − baseline) is below −tolerance:
+     *  the candidate may have regressed this dimension by more than tolerance. */
+    regressed: boolean;
+    tolerance: number;
+    n: number;
+}
+/** Detect the native scale of a set of scores: 0-100 when any magnitude clears
+ *  1.5, else [0,1]. Used to auto-scale the regression tolerance so a default
+ *  expressed for [0,1] is not silently a no-op on a 0-100 dimension. */
+declare function detectScale(values: number[]): 1 | 100;
+/** Per-critical-dimension regression guard. For each dimension, pair the
+ *  candidate vs baseline values by full cellId and bootstrap the paired delta;
+ *  a dimension is "regressed" when the CI lower bound < −tolerance (conservative
+ *  — blocks if the credible worst case exceeds tolerance, which is the right
+ *  posture for safety dimensions like `hallucination_free`). When `tolerance`
+ *  is omitted it auto-scales: 0.05 on [0,1], 5 on 0-100. */
+declare function dimensionRegressions(candidate: Map<string, Record<string, JudgeScore>>, baseline: Map<string, Record<string, JudgeScore>>, scenarioIds: Set<string>, criticalDimensions: string[], opts?: {
+    tolerance?: number;
+    confidence?: number;
+    resamples?: number;
+    seed?: number;
+}): DimensionRegression[];
 /**
  * @experimental
  *
@@ -648,4 +751,4 @@ declare function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAd
  *  as a ref under the adapter's worktree dir. */
 declare function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string;
-export { type AcceptedEdit, type ApplySkillPatchResult, type CampaignBreakdown, CampaignResult, CampaignStorage, CodeSurface, type CompareDriversOptions, DispatchContext, type DriverComparison, type DriverEntry, type DriverPairwise, type DriverScore, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type GitWorktreeAdapterOptions, ImprovementDriver, JudgeConfig, LabelTrust, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioSource, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, MutableSurface, type OptimizerEntryConfig, type ProfileDispatchFn, ProfileMatrixError, type ProfileSummary, type ProposePatchesArgs, type RejectedEdit, RunCampaignOptions, type RunProfileMatrixOptions, type RunProfileMatrixResult, type RunSkillOptOptions, type RunSkillOptResult, Scenario, type ScenarioRollup, type SkillOptDriver, type SkillOptDriverOptions, type SkillOptEpochRecord, type SkillOptEvidence, type SkillPatch, type SkillPatchOp, SkillPatchParseError, type SkillPatchRejection, type Worktree, type WorktreeAdapter, WorktreeAdapterError, applySkillPatch, campaignBreakdown, campaignMeanComposite, compareDrivers, gepaParetoEntry, gepaReflectionEntry, gitWorktreeAdapter, parseSkillPatchResponse, patchEditCount, resolveWorktreePath, runProfileMatrix, runSkillOpt, skillOptDriver, skillOptEntry };
+export { type AcceptedEdit, type ApplySkillPatchResult, type CampaignBreakdown, CampaignResult, CampaignStorage, CodeSurface, type CompareDriversOptions, type DimensionRegression, DispatchContext, type DriverComparison, type DriverEntry, type DriverPairwise, type DriverScore, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type GitWorktreeAdapterOptions, type HeldoutSignificance, type HeldoutSignificanceOptions, ImprovementDriver, JudgeConfig, JudgeScore, LabelTrust, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioSource, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, MutableSurface, type OptimizerEntryConfig, type PairedHoldout, type ProfileDispatchFn, ProfileMatrixError, type ProfileSummary, type ProposePatchesArgs, type RejectedEdit, RunCampaignOptions, type RunProfileMatrixOptions, type RunProfileMatrixResult, type RunSkillOptOptions, type RunSkillOptResult, Scenario, type ScenarioRollup, type SkillOptDriver, type SkillOptDriverOptions, type SkillOptEpochRecord, type SkillOptEvidence, type SkillPatch, type SkillPatchOp, SkillPatchParseError, type SkillPatchRejection, type Worktree, type WorktreeAdapter, WorktreeAdapterError, applySkillPatch, campaignBreakdown, campaignMeanComposite, compareDrivers, detectScale, dimensionRegressions, gepaParetoEntry, gepaReflectionEntry, gitWorktreeAdapter, heldoutSignificance, pairHoldout, parseSkillPatchResponse, patchEditCount, resolveWorktreePath, runProfileMatrix, runSkillOpt, skillOptDriver, skillOptEntry };

package/dist/campaign/index.js CHANGED Viewed

@@ -1,33 +1,37 @@
 import {
-  buildLoopProvenanceRecord,
   composeGate,
   defaultProductionGate,
-  emitLoopProvenance,
+  detectScale,
+  dimensionRegressions,
   evolutionaryDriver,
-  loopProvenanceSpans,
-  provenanceRecordPath,
-  provenanceSpansPath,
-  runEval,
-  surfaceContentHash
-} from "../chunk-RDK3P4JE.js";
+  heldoutSignificance,
+  pairHoldout,
+  runEval
+} from "../chunk-E24XD7A2.js";
 import {
   agentProfileHash
 } from "../chunk-PQV2TKC3.js";
 import {
+  buildLoopProvenanceRecord,
   campaignBreakdown,
   campaignMeanComposite,
   countSentenceEdits,
   defaultRenderDiff,
+  emitLoopProvenance,
   extractH2Sections,
   gepaDriver,
   heldOutGate,
   isProposedCandidate,
   labelTrustRank,
+  loopProvenanceSpans,
   openAutoPr,
+  provenanceRecordPath,
+  provenanceSpansPath,
   runImprovementLoop,
   runOptimization,
+  surfaceContentHash,
   surfaceHash
-} from "../chunk-Q56RRLEC.js";
+} from "../chunk-JFGZPUMU.js";
 import {
   assertRealBackend,
   fsCampaignStorage,
@@ -1091,6 +1095,8 @@ export {
   countSentenceEdits,
   defaultProductionGate,
   defaultRenderDiff,
+  detectScale,
+  dimensionRegressions,
   emitLoopProvenance,
   evolutionaryDriver,
   extractH2Sections,
@@ -1100,11 +1106,13 @@ export {
   gepaReflectionEntry,
   gitWorktreeAdapter,
   heldOutGate,
+  heldoutSignificance,
   inMemoryCampaignStorage,
   isProposedCandidate,
   labelTrustRank,
   loopProvenanceSpans,
   openAutoPr,
+  pairHoldout,
   parseSkillPatchResponse,
   patchEditCount,
   provenanceRecordPath,