@tangle-network/agent-eval 0.65.0 → 0.67.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/CHANGELOG.md +25 -0
  2. package/dist/adapters/otel.d.ts +1 -1
  3. package/dist/campaign/index.d.ts +110 -6
  4. package/dist/campaign/index.js +26 -19
  5. package/dist/campaign/index.js.map +1 -1
  6. package/dist/{chunk-7TPYV2ER.js → chunk-6XQIEUQ2.js} +140 -7
  7. package/dist/chunk-6XQIEUQ2.js.map +1 -0
  8. package/dist/{chunk-HKINEDRZ.js → chunk-DFS3FEXO.js} +3 -2
  9. package/dist/chunk-DFS3FEXO.js.map +1 -0
  10. package/dist/chunk-MZ2IYGGN.js +592 -0
  11. package/dist/chunk-MZ2IYGGN.js.map +1 -0
  12. package/dist/{chunk-4ODZXQV2.js → chunk-NV2PF37Q.js} +645 -2
  13. package/dist/chunk-NV2PF37Q.js.map +1 -0
  14. package/dist/contract/index.d.ts +11 -9
  15. package/dist/contract/index.js +11 -12
  16. package/dist/contract/index.js.map +1 -1
  17. package/dist/hosted/index.d.ts +1 -1
  18. package/dist/hosted/index.js +1 -1
  19. package/dist/{index-CzhtwYBT.d.ts → index-DSEHMwvS.d.ts} +4 -2
  20. package/dist/index.d.ts +251 -7
  21. package/dist/index.js +292 -2
  22. package/dist/index.js.map +1 -1
  23. package/dist/openapi.json +1 -1
  24. package/dist/provenance-CChUqexv.d.ts +314 -0
  25. package/dist/{registry-DPly4_hZ.d.ts → registry-BGKyX6bw.d.ts} +2 -2
  26. package/dist/release-report-CN8hJlhk.d.ts +233 -0
  27. package/dist/reporting.d.ts +4 -3
  28. package/dist/{run-campaign-5J3ED2UJ.js → run-campaign-BVY3RGAZ.js} +2 -3
  29. package/dist/{provenance-lqyLpOYR.d.ts → run-improvement-loop-BKpM5T4t.d.ts} +51 -329
  30. package/dist/statistics-B7yCbi9i.d.ts +253 -0
  31. package/dist/{types-DhqpAi_z.d.ts → types-Croy5h7V.d.ts} +1 -1
  32. package/package.json +1 -1
  33. package/dist/chunk-4ODZXQV2.js.map +0 -1
  34. package/dist/chunk-7TPYV2ER.js.map +0 -1
  35. package/dist/chunk-CZRKD2X2.js +0 -1104
  36. package/dist/chunk-CZRKD2X2.js.map +0 -1
  37. package/dist/chunk-E22YUOAL.js +0 -111
  38. package/dist/chunk-E22YUOAL.js.map +0 -1
  39. package/dist/chunk-HKINEDRZ.js.map +0 -1
  40. package/dist/release-report-DGoeObZT.d.ts +0 -484
  41. package/dist/{run-campaign-5J3ED2UJ.js.map → run-campaign-BVY3RGAZ.js.map} +0 -0
package/CHANGELOG.md CHANGED
@@ -4,6 +4,31 @@ All notable changes to `@tangle-network/agent-eval` and its sibling `agent-eval-
4
4
 
5
5
  ---
6
6
 
7
+ ## [0.67.0] — 2026-05-30 — the promotion gate is statistically trustworthy (no more shipping noise)
8
+
9
+ An adversarial review of a real "ship +4.0 lift" decision found it was a **triple false positive**: the driver's candidate lost on train, so the winner was the baseline (empty diff); the loop re-scored the baseline against ITSELF on the holdout and read run-to-run model noise (91 vs 95) as a "+4 lift"; and a point-estimate gate (`delta >= 0.03` on a 0-100 scale, `reps:1`) shipped it — while the reward-hacking gate was blind to a −30 regression on a safety dimension hiding under the +4 net. The promotion gate could not tell a real improvement from noise or from a Goodhart trade.
10
+
11
+ ### Fixed / Added
12
+
13
+ - **No-op guard** (`runImprovementLoop`) — when the winner is byte-identical to the baseline (no candidate beat the training baseline, empty diff), the loop now forces `hold` and skips the meaningless baseline-vs-itself holdout pass, instead of shipping the noise delta.
14
+ - **Statistical held-out gate** — `defaultProductionGate`'s held-out check is now a **paired bootstrap CI**, not a point estimate. It pairs candidate vs baseline holdout cells by **full `cellId` (`scenario:rep`)** — never averaging reps away — and ships only when the CI lower bound clears `deltaThreshold` (default 0 ⇒ confidently positive). Below `minProductiveRuns` (default 3) paired observations it HOLDS with `few_runs` rather than reading a degenerate interval. (New module `src/campaign/gates/statistical-heldout.ts`; reuses `pairedBootstrap` from `src/statistics.ts`.)
15
+ - **Per-dimension regression guard (anti-Goodhart)** — `criticalDimensions` + `regressionTolerance` on `DefaultProductionGateOptions`. The gate HOLDS if any guarded dimension's paired-delta CI lower bound falls below −tolerance, even when the net composite rose. Tolerance auto-scales (0.05 on [0,1], 5 on 0-100) so a default expressed for one scale isn't a silent no-op on the other.
16
+ - **Exports** `pairHoldout`, `heldoutSignificance`, `dimensionRegressions`, `detectScale` from `/campaign`.
17
+
18
+ This collapses the duplicated gate tech-debt (a rigorous `src/held-out-gate.ts` existed but the loop wired the weak adapter) onto the shared `pairedBootstrap` statistics. 12 new regression tests, including the exact noisy-same-mean false positive and the composite-up/dimension-down Goodhart trade. Full suite (1624) green. The remaining path to a *proven* self-improvement (headroom corpus + Goodhart-resistant measurement, driver effectiveness, inter-cycle compounding) is tracked separately.
19
+
20
+ ## [0.66.0] — 2026-05-30 — the improvement loop can no longer hang silently or ingest to the wrong URL
21
+
22
+ ### Fixed
23
+
24
+ - **`runCampaign` per-cell dispatch deadline (`dispatchTimeoutMs`).** A dispatch that neither resolves nor rejects — a stalled model request, an exhausted runtime resource, a stream that never closes — used to hang the cell, and with it the lane, the campaign, `runImprovementLoop`, and the CI job above them, **forever, with no diagnostic**. The cell now races its dispatch against the deadline; on timeout it aborts the cell's `ctx.signal` and records a LOUD error (`dispatch exceeded <N>ms`) while the campaign proceeds. `undefined`/`0` = unbounded (legacy).
25
+ - **`runImprovementLoop` fails loud on an empty holdout.** When every holdout dispatch or judge errored, the gate read both means as 0, computed delta 0, and silently **"held" on garbage** — indistinguishable from a real no-lift result, masking upstream crashes (e.g. a consumer scorer that threw on a malformed scenario). The loop now throws a diagnostic error naming the first underlying failure instead of emitting a verdict over zero scorable cells. It also applies a default per-cell deadline (`DEFAULT_DISPATCH_TIMEOUT_MS`, 10 min, overridable) to every campaign it runs.
26
+ - **Hosted ingest URL normalization.** The client appends the versioned `/v1/ingest/...` path itself, but callers (and the client's own prior doc) routinely pass the versioned base `https://host/v1` — producing `/v1/v1/ingest/...` → **404, silently dropping every event**. `post()` now strips a trailing `/v1` (and slashes) from the endpoint so both `https://host` and `https://host/v1` resolve correctly; the doc now shows the bare host.
27
+
28
+ ### Why it matters
29
+
30
+ These three were a single failure chain in production: a consumer's judge threw on a subset of scenarios → the holdout produced no scorable cells → the loop hung instead of failing loud → no decision, no provenance — and even when it did complete, the activated ingest env (`…/v1`) 404'd. The loop now either completes with real data or fails loud, and its provenance lands.
31
+
7
32
  ## [0.65.0] — 2026-05-30 — `emitLoopProvenance` ships the eval-run event too (full dashboard visibility)
8
33
 
9
34
  ### Fixed
package/dist/adapters/otel.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { T as TraceSpanEvent, H as HostedClient } from '../index-CzhtwYBT.js';
1
+ import { T as TraceSpanEvent, H as HostedClient } from '../index-DSEHMwvS.js';
2
2
  import '../types-c2R2kfmv.js';
3
3
  import '../run-record-BgTFzO2r.js';
4
4
  import '../errors-Dwqw-T_m.js';
package/dist/campaign/index.d.ts CHANGED
@@ -1,8 +1,10 @@
1
- import { a as RunCampaignOptions, C as CampaignStorage } from '../provenance-lqyLpOYR.js';
2
- export { B as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, m as EmitLoopProvenanceArgs, n as EmitLoopProvenanceResult, E as EvolutionaryDriverOptions, o as GepaDriverConstraints, G as GepaDriverOptions, H as HeldOutGateOptions, p as LoopProvenanceBackend, q as LoopProvenanceCandidate, L as LoopProvenanceRecord, O as OpenAutoPrOptions, s as OpenAutoPrResult, b as RunEvalOptions, c as RunImprovementLoopOptions, R as RunImprovementLoopResult, t as RunOptimizationOptions, u as RunOptimizationResult, v as buildLoopProvenanceRecord, d as composeGate, w as countSentenceEdits, e as defaultProductionGate, x as defaultRenderDiff, y as emitLoopProvenance, f as evolutionaryDriver, z as extractH2Sections, g as fsCampaignStorage, h as gepaDriver, i as heldOutGate, j as inMemoryCampaignStorage, A as loopProvenanceSpans, F as openAutoPr, I as provenanceRecordPath, J as provenanceSpansPath, r as runCampaign, k as runEval, l as runImprovementLoop, K as runOptimization, M as surfaceContentHash, N as surfaceHash } from '../provenance-lqyLpOYR.js';
1
+ import { a as RunCampaignOptions, C as CampaignStorage } from '../run-improvement-loop-BKpM5T4t.js';
2
+ export { d as GepaDriverConstraints, G as GepaDriverOptions, O as OpenAutoPrOptions, e as OpenAutoPrResult, b as RunImprovementLoopOptions, R as RunImprovementLoopResult, h as RunOptimizationOptions, j as RunOptimizationResult, k as countSentenceEdits, l as defaultRenderDiff, m as extractH2Sections, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, o as openAutoPr, r as runCampaign, c as runImprovementLoop, n as runOptimization, s as surfaceHash } from '../run-improvement-loop-BKpM5T4t.js';
3
+ export { B as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, a as EmitLoopProvenanceArgs, b as EmitLoopProvenanceResult, E as EvolutionaryDriverOptions, H as HeldOutGateOptions, f as LoopProvenanceBackend, g as LoopProvenanceCandidate, L as LoopProvenanceRecord, R as RunEvalOptions, i as buildLoopProvenanceRecord, c as composeGate, d as defaultProductionGate, j as emitLoopProvenance, e as evolutionaryDriver, h as heldOutGate, l as loopProvenanceSpans, p as provenanceRecordPath, k as provenanceSpansPath, r as runEval, s as surfaceContentHash } from '../provenance-CChUqexv.js';
3
4
  import { L as LlmClientOptions } from '../llm-client-DbjLfz-K.js';
4
- import { I as ImprovementDriver, L as LabeledScenarioStore, q as LabeledScenarioWrite, r as LabeledScenarioSampleArgs, s as LabeledScenarioRecord, t as LabelTrust, S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, u as LabeledScenarioSource, f as CampaignResult, h as CodeSurface } from '../types-c2R2kfmv.js';
5
- export { C as CampaignAggregates, c as CampaignArtifactWriter, d as CampaignCellResult, e as CampaignCostMeter, v as CampaignTokenUsage, g as CampaignTraceWriter, D as DispatchFn, G as Gate, i as GateContext, j as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, w as JudgeAggregate, n as JudgeDimension, J as JudgeScore, o as Mutator, O as OptimizerConfig, P as ParetoParent, x as ProposeContext, y as ProposedCandidate, R as RedactionStatus, z as ScenarioAggregate, p as SessionScript, T as TraceSpan, A as isProposedCandidate, B as labelTrustRank } from '../types-c2R2kfmv.js';
5
+ import { I as ImprovementDriver, J as JudgeScore, L as LabeledScenarioStore, q as LabeledScenarioWrite, r as LabeledScenarioSampleArgs, s as LabeledScenarioRecord, t as LabelTrust, S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, u as LabeledScenarioSource, f as CampaignResult, h as CodeSurface } from '../types-c2R2kfmv.js';
6
+ export { C as CampaignAggregates, c as CampaignArtifactWriter, d as CampaignCellResult, e as CampaignCostMeter, v as CampaignTokenUsage, g as CampaignTraceWriter, D as DispatchFn, G as Gate, i as GateContext, j as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, w as JudgeAggregate, n as JudgeDimension, o as Mutator, O as OptimizerConfig, P as ParetoParent, x as ProposeContext, y as ProposedCandidate, R as RedactionStatus, z as ScenarioAggregate, p as SessionScript, T as TraceSpan, A as isProposedCandidate, B as labelTrustRank } from '../types-c2R2kfmv.js';
7
+ import { a as PairedBootstrapResult } from '../statistics-B7yCbi9i.js';
6
8
  import { A as AgentProfile, B as BackendIntegrityReport } from '../agent-profile-DzcPHR1Z.js';
7
9
  import { A as AgentEvalError } from '../errors-Dwqw-T_m.js';
8
10
  import { b as RunSplitTag, R as RunRecord } from '../run-record-BgTFzO2r.js';
@@ -10,11 +12,13 @@ import '../red-team-DW9Ca_tj.js';
10
12
  import '../dataset-B2kL-fSM.js';
11
13
  import '../store-CKUAgsJz.js';
12
14
  import '../schema-m0gsnbt3.js';
13
- import '../index-CzhtwYBT.js';
15
+ import '../index-DSEHMwvS.js';
14
16
  import '../summary-report-ByiOUrHj.js';
15
17
  import '../failure-cluster-CL7IVgkJ.js';
16
18
  import '../judge-calibration-DilmB3Ml.js';
17
19
  import '../raw-provider-sink-C46HDghv.js';
20
+ import '../types-Croy5h7V.js';
21
+ import '@tangle-network/tcloud';
18
22
 
19
23
  /**
20
24
  * @experimental
@@ -163,6 +167,106 @@ declare class SkillPatchParseError extends Error {
163
167
  }
164
168
  declare function parseSkillPatchResponse(raw: string, maxPatches: number, editBudget: number): SkillPatch[];
165
169
 
170
+ /**
171
+ * @experimental
172
+ *
173
+ * Statistical held-out promotion machinery — the trustworthy core the
174
+ * point-estimate `heldout-delta` gate lacked.
175
+ *
176
+ * The shipped false positive it prevents: a winner re-scored against the
177
+ * baseline on the holdout read run-to-run model NOISE (e.g. 91 vs 95) as a
178
+ * "+4 lift" and shipped, because the gate compared point estimates with no
179
+ * confidence interval. Here we pair candidate vs baseline holdout observations
180
+ * and bootstrap a CI on the paired delta — a candidate ships only when the CI
181
+ * lower bound clears the effect-size threshold (the gain is real at the
182
+ * confidence level, not noise), and is blocked when a critical dimension
183
+ * (e.g. `hallucination_free` for a legal agent) significantly regresses even if
184
+ * the net composite rose (anti-Goodhart).
185
+ *
186
+ * Two traps this module is built around (both produce a NEW false positive if
187
+ * gotten wrong):
188
+ * 1. PAIRING GRANULARITY — pairs by FULL `cellId` (`scenario:rep`), never by
189
+ * `scenarioId` (which averages reps away and destroys the within-pair
190
+ * variance reduction that makes a paired bootstrap tighter than unpaired).
191
+ * One paired observation per cell ⇒ reps multiply n.
192
+ * 2. SCALE — a judge may emit composites/dimensions on [0,1] or 0-100. The
193
+ * threshold + tolerance are interpreted in the judge's NATIVE scale; the
194
+ * per-dimension tolerance auto-scales off the observed baseline magnitudes
195
+ * so `-0.10` on [0,1] doesn't silently become a no-op on a 0-100 dimension.
196
+ */
197
+
198
+ interface PairedHoldout {
199
+ /** Baseline scalar per paired cell (same order as `after`/`cellIds`). */
200
+ before: number[];
201
+ /** Candidate scalar per paired cell. */
202
+ after: number[];
203
+ /** The full cellIds (`scenario:rep`) that paired, in order. */
204
+ cellIds: string[];
205
+ }
206
+ /**
207
+ * Pair candidate vs baseline holdout observations by FULL cellId. `select`
208
+ * pulls the scalar from a cell's judge reports (composite, or a named
209
+ * dimension); a cell contributes the mean of `select` across its judges. Cells
210
+ * whose scenario is not in `scenarioIds`, or where `select` is undefined for
211
+ * every judge on either side, are skipped on BOTH sides so the arrays stay
212
+ * paired. Throws when the two maps disagree on which holdout cells exist — a
213
+ * load-bearing invariant: the baseline + winner holdout campaigns run the same
214
+ * scenarios with the same seed base, so their cellIds MUST align; a mismatch
215
+ * means a silent pairing bug, not a soft fallback.
216
+ */
217
+ declare function pairHoldout(candidate: Map<string, Record<string, JudgeScore>>, baseline: Map<string, Record<string, JudgeScore>>, scenarioIds: Set<string>, select: (s: JudgeScore) => number | undefined): PairedHoldout;
218
+ interface HeldoutSignificance {
219
+ paired: PairedHoldout;
220
+ bootstrap: PairedBootstrapResult;
221
+ /** n paired observations. */
222
+ n: number;
223
+ /** True iff n >= minProductiveRuns AND the CI lower bound clears the threshold. */
224
+ significant: boolean;
225
+ /** Set when n < minProductiveRuns — too little evidence to claim significance. */
226
+ fewRuns: boolean;
227
+ }
228
+ interface HeldoutSignificanceOptions {
229
+ deltaThreshold?: number;
230
+ minProductiveRuns?: number;
231
+ confidence?: number;
232
+ resamples?: number;
233
+ /** Fixed by default for a deterministic, reproducible gate verdict. */
234
+ seed?: number;
235
+ statistic?: 'mean' | 'median';
236
+ }
237
+ /** Significance of the held-out composite lift: ship only when the paired
238
+ * bootstrap CI lower bound on (candidate − baseline) exceeds `deltaThreshold`
239
+ * (default 0 ⇒ "confidently positive"). Below `minProductiveRuns` paired
240
+ * observations there is not enough evidence to claim significance → not
241
+ * significant (`fewRuns`). Interpret `deltaThreshold` in the judge's native
242
+ * composite scale. */
243
+ declare function heldoutSignificance(paired: PairedHoldout, opts?: HeldoutSignificanceOptions): HeldoutSignificance;
244
+ interface DimensionRegression {
245
+ dimension: string;
246
+ bootstrap: PairedBootstrapResult;
247
+ /** True iff the CI lower bound on (candidate − baseline) is below −tolerance:
248
+ * the candidate may have regressed this dimension by more than tolerance. */
249
+ regressed: boolean;
250
+ tolerance: number;
251
+ n: number;
252
+ }
253
+ /** Detect the native scale of a set of scores: 0-100 when any magnitude clears
254
+ * 1.5, else [0,1]. Used to auto-scale the regression tolerance so a default
255
+ * expressed for [0,1] is not silently a no-op on a 0-100 dimension. */
256
+ declare function detectScale(values: number[]): 1 | 100;
257
+ /** Per-critical-dimension regression guard. For each dimension, pair the
258
+ * candidate vs baseline values by full cellId and bootstrap the paired delta;
259
+ * a dimension is "regressed" when the CI lower bound < −tolerance (conservative
260
+ * — blocks if the credible worst case exceeds tolerance, which is the right
261
+ * posture for safety dimensions like `hallucination_free`). When `tolerance`
262
+ * is omitted it auto-scales: 0.05 on [0,1], 5 on 0-100. */
263
+ declare function dimensionRegressions(candidate: Map<string, Record<string, JudgeScore>>, baseline: Map<string, Record<string, JudgeScore>>, scenarioIds: Set<string>, criticalDimensions: string[], opts?: {
264
+ tolerance?: number;
265
+ confidence?: number;
266
+ resamples?: number;
267
+ seed?: number;
268
+ }): DimensionRegression[];
269
+
166
270
  /**
167
271
  * @experimental
168
272
  *
@@ -647,4 +751,4 @@ declare function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAd
647
751
  * as a ref under the adapter's worktree dir. */
648
752
  declare function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string;
649
753
 
650
- export { type AcceptedEdit, type ApplySkillPatchResult, type CampaignBreakdown, CampaignResult, CampaignStorage, CodeSurface, type CompareDriversOptions, DispatchContext, type DriverComparison, type DriverEntry, type DriverPairwise, type DriverScore, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type GitWorktreeAdapterOptions, ImprovementDriver, JudgeConfig, LabelTrust, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioSource, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, MutableSurface, type OptimizerEntryConfig, type ProfileDispatchFn, ProfileMatrixError, type ProfileSummary, type ProposePatchesArgs, type RejectedEdit, RunCampaignOptions, type RunProfileMatrixOptions, type RunProfileMatrixResult, type RunSkillOptOptions, type RunSkillOptResult, Scenario, type ScenarioRollup, type SkillOptDriver, type SkillOptDriverOptions, type SkillOptEpochRecord, type SkillOptEvidence, type SkillPatch, type SkillPatchOp, SkillPatchParseError, type SkillPatchRejection, type Worktree, type WorktreeAdapter, WorktreeAdapterError, applySkillPatch, campaignBreakdown, campaignMeanComposite, compareDrivers, gepaParetoEntry, gepaReflectionEntry, gitWorktreeAdapter, parseSkillPatchResponse, patchEditCount, resolveWorktreePath, runProfileMatrix, runSkillOpt, skillOptDriver, skillOptEntry };
754
+ export { type AcceptedEdit, type ApplySkillPatchResult, type CampaignBreakdown, CampaignResult, CampaignStorage, CodeSurface, type CompareDriversOptions, type DimensionRegression, DispatchContext, type DriverComparison, type DriverEntry, type DriverPairwise, type DriverScore, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type GitWorktreeAdapterOptions, type HeldoutSignificance, type HeldoutSignificanceOptions, ImprovementDriver, JudgeConfig, JudgeScore, LabelTrust, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioSource, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, MutableSurface, type OptimizerEntryConfig, type PairedHoldout, type ProfileDispatchFn, ProfileMatrixError, type ProfileSummary, type ProposePatchesArgs, type RejectedEdit, RunCampaignOptions, type RunProfileMatrixOptions, type RunProfileMatrixResult, type RunSkillOptOptions, type RunSkillOptResult, Scenario, type ScenarioRollup, type SkillOptDriver, type SkillOptDriverOptions, type SkillOptEpochRecord, type SkillOptEvidence, type SkillPatch, type SkillPatchOp, SkillPatchParseError, type SkillPatchRejection, type Worktree, type WorktreeAdapter, WorktreeAdapterError, applySkillPatch, campaignBreakdown, campaignMeanComposite, compareDrivers, detectScale, dimensionRegressions, gepaParetoEntry, gepaReflectionEntry, gitWorktreeAdapter, heldoutSignificance, pairHoldout, parseSkillPatchResponse, patchEditCount, resolveWorktreePath, runProfileMatrix, runSkillOpt, skillOptDriver, skillOptEntry };
package/dist/campaign/index.js CHANGED
@@ -1,41 +1,44 @@
1
1
  import {
2
2
  buildLoopProvenanceRecord,
3
- campaignBreakdown,
4
- campaignMeanComposite,
5
3
  composeGate,
6
- countSentenceEdits,
7
4
  defaultProductionGate,
8
- defaultRenderDiff,
5
+ detectScale,
6
+ dimensionRegressions,
9
7
  emitLoopProvenance,
10
8
  evolutionaryDriver,
9
+ heldoutSignificance,
10
+ loopProvenanceSpans,
11
+ pairHoldout,
12
+ provenanceRecordPath,
13
+ provenanceSpansPath,
14
+ runEval,
15
+ surfaceContentHash
16
+ } from "../chunk-MZ2IYGGN.js";
17
+ import {
18
+ agentProfileHash
19
+ } from "../chunk-PQV2TKC3.js";
20
+ import {
21
+ campaignBreakdown,
22
+ campaignMeanComposite,
23
+ countSentenceEdits,
24
+ defaultRenderDiff,
11
25
  extractH2Sections,
12
26
  gepaDriver,
13
27
  heldOutGate,
14
28
  isProposedCandidate,
15
29
  labelTrustRank,
16
- loopProvenanceSpans,
17
30
  openAutoPr,
18
- provenanceRecordPath,
19
- provenanceSpansPath,
20
- runEval,
21
31
  runImprovementLoop,
22
32
  runOptimization,
23
- surfaceContentHash,
24
33
  surfaceHash
25
- } from "../chunk-CZRKD2X2.js";
34
+ } from "../chunk-NV2PF37Q.js";
26
35
  import {
36
+ assertRealBackend,
27
37
  fsCampaignStorage,
28
38
  inMemoryCampaignStorage,
29
- runCampaign
30
- } from "../chunk-7TPYV2ER.js";
31
- import {
32
- agentProfileHash
33
- } from "../chunk-PQV2TKC3.js";
34
- import "../chunk-4ODZXQV2.js";
35
- import {
36
- assertRealBackend,
39
+ runCampaign,
37
40
  summarizeBackendIntegrity
38
- } from "../chunk-E22YUOAL.js";
41
+ } from "../chunk-6XQIEUQ2.js";
39
42
  import "../chunk-YV7J7X5N.js";
40
43
  import {
41
44
  validateRunRecord
@@ -1092,6 +1095,8 @@ export {
1092
1095
  countSentenceEdits,
1093
1096
  defaultProductionGate,
1094
1097
  defaultRenderDiff,
1098
+ detectScale,
1099
+ dimensionRegressions,
1095
1100
  emitLoopProvenance,
1096
1101
  evolutionaryDriver,
1097
1102
  extractH2Sections,
@@ -1101,11 +1106,13 @@ export {
1101
1106
  gepaReflectionEntry,
1102
1107
  gitWorktreeAdapter,
1103
1108
  heldOutGate,
1109
+ heldoutSignificance,
1104
1110
  inMemoryCampaignStorage,
1105
1111
  isProposedCandidate,
1106
1112
  labelTrustRank,
1107
1113
  loopProvenanceSpans,
1108
1114
  openAutoPr,
1115
+ pairHoldout,
1109
1116
  parseSkillPatchResponse,
1110
1117
  patchEditCount,
1111
1118
  provenanceRecordPath,