@tangle-network/agent-eval 0.61.0 → 0.63.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +48 -8
- package/dist/adapters/http.d.ts +4 -1
- package/dist/adapters/langchain.d.ts +4 -1
- package/dist/adapters/otel.d.ts +4 -4
- package/dist/{agent-profile-9J9hxdm2.d.ts → agent-profile-DzcPHR1Z.d.ts} +1 -1
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/campaign/index.d.ts +388 -11
- package/dist/campaign/index.js +597 -12
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-GMXHLSLL.js → chunk-4ODZXQV2.js} +81 -98
- package/dist/chunk-4ODZXQV2.js.map +1 -0
- package/dist/{chunk-OLULBECP.js → chunk-7TPYV2ER.js} +27 -1
- package/dist/chunk-7TPYV2ER.js.map +1 -0
- package/dist/chunk-E22YUOAL.js +111 -0
- package/dist/chunk-E22YUOAL.js.map +1 -0
- package/dist/{chunk-SUGME4OT.js → chunk-Z7ZU7IYZ.js} +209 -85
- package/dist/chunk-Z7ZU7IYZ.js.map +1 -0
- package/dist/contract/index.d.ts +9 -9
- package/dist/contract/index.js +4 -3
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-Bf8owbuG.d.ts → control-DxvZeV5X.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/hosted/index.d.ts +4 -4
- package/dist/{index-Bvk35ils.d.ts → index-DsnOpCO6.d.ts} +1 -1
- package/dist/{index-D9dwa00f.d.ts → index-GISRh500.d.ts} +2 -2
- package/dist/index.d.ts +98 -14
- package/dist/index.js +331 -128
- package/dist/index.js.map +1 -1
- package/dist/meta-eval/index.d.ts +2 -2
- package/dist/multishot/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/{provenance-D0WeCXt1.d.ts → provenance-cUnovpWV.d.ts} +42 -11
- package/dist/{registry-qmbYT3Eo.d.ts → registry-DPly4_hZ.d.ts} +1 -1
- package/dist/{release-report-DszkgvJ3.d.ts → release-report-DGoeObZT.d.ts} +2 -2
- package/dist/reporting.d.ts +4 -4
- package/dist/{researcher-BaVsy0sW.d.ts → researcher-WJvIpX3L.d.ts} +2 -2
- package/dist/rl.d.ts +6 -6
- package/dist/{rubric-predictive-validity-DgBHWsh7.d.ts → rubric-predictive-validity-D_4BSXGV.d.ts} +1 -1
- package/dist/{run-campaign-HXPJAUZ3.js → run-campaign-5J3ED2UJ.js} +3 -2
- package/dist/{run-record-DgUVo5pw.d.ts → run-record-BgTFzO2r.d.ts} +1 -1
- package/dist/{summary-report-BQvXpvaR.d.ts → summary-report-ByiOUrHj.d.ts} +1 -1
- package/dist/{types-Beb6KPqZ.d.ts → types-c2R2kfmv.d.ts} +45 -12
- package/package.json +1 -1
- package/dist/chunk-GMXHLSLL.js.map +0 -1
- package/dist/chunk-OLULBECP.js.map +0 -1
- package/dist/chunk-SUGME4OT.js.map +0 -1
- /package/dist/{run-campaign-HXPJAUZ3.js.map → run-campaign-5J3ED2UJ.js.map} +0 -0
package/CHANGELOG.md
CHANGED
|
@@ -4,24 +4,64 @@ All notable changes to `@tangle-network/agent-eval` and its sibling `agent-eval-
|
|
|
4
4
|
|
|
5
5
|
---
|
|
6
6
|
|
|
7
|
-
## [0.
|
|
7
|
+
## [0.63.0] — 2026-05-30 — the full optimizer drivers: GEPA Pareto + SkillOpt + a head-to-head lift benchmark
|
|
8
|
+
|
|
9
|
+
Closes the optimizer-completeness gap (#101/#100). `gepaDriver` was reflection-only; the SOTA SkillOpt technique was roadmapped but unbuilt; and there was no head-to-head benchmark, so optimizer quality was measurement-invisible — a simplified driver could ship unnoticed. This release ships both drivers in full and the forcing function that keeps them honest.
|
|
8
10
|
|
|
9
11
|
### Added
|
|
10
12
|
|
|
11
|
-
-
|
|
12
|
-
-
|
|
13
|
+
- **GEPA Pareto frontier + combine-complementary-lessons (#101).** `runOptimization` now accumulates every scored surface as a per-scenario objective vector and recomputes the non-dominated set before each generation, handing it to the driver as `ctx.paretoParents` (new `ParetoParent` type). A surface uniquely best on one hard scenario survives even when its mean composite is lower. `gepaDriver` spends one population slot merging the frontier parents' complementary strengths (toggle via `combineParents`, default on; fires only when the frontier has >1 member). `RunOptimizationResult.paretoFrontier` exposes the final frontier. Dominance is computed by the package-canonical `paretoFrontier` (`src/pareto.ts`) — the parallel `src/campaign/pareto.ts` fork has been deleted (one dominance implementation).
|
|
14
|
+
- **SkillOpt patch-mode driver + `runSkillOpt` preset (#100)** (Microsoft, arXiv:2605.23904). `skillOptDriver` proposes BOUNDED add/delete/replace patches to one skill document (`applySkillPatch`, `SkillPatch`); `runSkillOpt` is the held-out-gated epoch hill-climb: reflect on TRAIN weaknesses → propose ≤ `editBudget` ops → score on the held-out split → ACCEPT only on STRICT held-out improvement, else buffer the rejected edit; with edit-budget annealing (the "textual learning rate") and a slow-update meta note. The held-out composite is monotonically non-decreasing by construction — a regression can never ship. Proposals reflect on train evidence only (no held-out leakage).
|
|
15
|
+
- **`compareDrivers` head-to-head lift benchmark (the forcing function).** Runs N optimizer entries on ONE corpus, scores the baseline + every promoted surface UNIFORMLY on the same held-out scenarios, and reports per-driver lift + paired-bootstrap CI + pairwise "which driver wins" CIs, ranked (cost breaks a lift tie). Ships `gepaReflectionEntry` / `gepaParetoEntry` / `skillOptEntry` to wire the real optimizers. Optimizer quality is now a number with a confidence interval — a driver regression turns a build red instead of going invisible.
|
|
16
|
+
- **`campaignMeanComposite` / `campaignBreakdown`** (`score-utils`) — the one definition of "composite of a campaign" + per-scenario/dimension breakdown, now shared by `runOptimization`, `runSkillOpt`, and `compareDrivers` (extracted from `runOptimization`'s private copies).
|
|
13
17
|
|
|
14
|
-
###
|
|
18
|
+
### Changed
|
|
15
19
|
|
|
16
|
-
-
|
|
20
|
+
- `gepaDriver`'s docstring + new `combineParents`/`combineMaxParents` options reflect the now-complete GEPA mapping (reflection + Pareto + combine).
|
|
17
21
|
|
|
18
|
-
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## [0.62.0] — 2026-05-30 — eval↔runtime boundary hardening (honest cost meter + per-cell stub guard)
|
|
25
|
+
|
|
26
|
+
From the agent-eval ↔ agent-runtime boundary critique. Builds on `runProfileMatrix` (0.61.0).
|
|
27
|
+
|
|
28
|
+
### Fixed
|
|
29
|
+
|
|
30
|
+
- **`CampaignCostMeter` docstring no longer lies.** It claimed "Substrate auto-tracks LLM costs via the cost-ledger backend hooks" — false (the meter mutates only on explicit `observe`/`observeTokens`), and it contradicted `observeTokens`' own doc. That doc was the root cause of consumers skipping `observeTokens`, getting `{0,0}` stub cells, and building `RunRecord`s on a side-channel. The doc now states plainly: nothing is captured automatically; the dispatch MUST report.
|
|
31
|
+
|
|
32
|
+
### Added
|
|
33
|
+
|
|
34
|
+
- **`runCampaign({ expectUsage })`** — per-cell stub guard, the early/fine-grained sibling of batch `assertRealBackend`. A cell that produced an artifact but reported `costUsd === 0` AND zero tokens is a stub. Modes: `'warn'` (default, non-breaking), `'assert'` (throw `BackendIntegrityError` on the first stub cell), `'off'` (replay/offline). Errored/skipped cells and deterministic judge-only runs are not flagged.
|
|
35
|
+
|
|
36
|
+
### Changed
|
|
37
|
+
|
|
38
|
+
- **`CampaignTokenUsage` is now `type CampaignTokenUsage = RunTokenUsage`** (one source of truth; a field added to `RunTokenUsage` is a compile error here, not silent drift across the three hand-synced copies the audit found).
|
|
39
|
+
- **multishot aliases sandbox's `AgentProfile` → `SandboxAgentProfile`** so it no longer collides with the eval-harness `AgentProfile` the root exports.
|
|
40
|
+
|
|
41
|
+
### Boundary
|
|
42
|
+
|
|
43
|
+
- **`tests/boundary-integrity.test.ts`** — mechanically enforces the zero-upward-dependency rule (agent-eval must never import agent-runtime/agent-knowledge). The CLAUDE.md rule was prose-only; it is now a red build.
|
|
44
|
+
|
|
45
|
+
### Notes
|
|
46
|
+
|
|
47
|
+
Pure additive/doc surface (`expectUsage` defaults to non-breaking `'warn'`). Full suite 1538/1538 green. Consumer-side: agent-runtime `loopDispatch` (0.32.0) turns the whole seam into one un-mis-wireable call.
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## [0.61.0] — 2026-05-30 — `runProfileMatrix` (profile × scenario × persona matrix with integrity by construction)
|
|
52
|
+
|
|
53
|
+
### Added
|
|
54
|
+
|
|
55
|
+
- **`runProfileMatrix({ profiles, scenarios, dispatch, judges, reps, integrity, personaOf })`** (`@tangle-network/agent-eval/campaign`) — the keystone that lets a consumer express a multi-profile × scenario/persona eval as **one** call instead of a hand-rolled `eval:*` script. Fans `profiles` over the scenario/persona corpus, runs `runCampaign` per profile, maps every cell to a validated `RunRecord` carrying real `tokenUsage`, and runs **`assertRealBackend` by construction**. Returns `{ records, byProfile, byScenario, byPersona, integrity, campaigns }`.
|
|
56
|
+
- **`ProfileMatrixError`** — thrown at preflight (before any LLM spend) when a profile's model lacks a snapshot version or the lists are empty.
|
|
57
|
+
|
|
58
|
+
### Fixed / closed gap
|
|
19
59
|
|
|
20
|
-
|
|
60
|
+
- **Token usage captured by `runCampaign`** — `CampaignCostMeter` gains `observeTokens()`/`tokens()` and `CampaignCellResult` gains `tokenUsage`, so the integrity guards can run on a `CampaignResult` (they key on `tokenUsage`). Closes the gap for **every** campaign consumer.
|
|
21
61
|
|
|
22
62
|
### Notes
|
|
23
63
|
|
|
24
|
-
|
|
64
|
+
7 new tests; the keystone is the **stub→throws** regression. Full suite 1527/1527 green at release.
|
|
25
65
|
|
|
26
66
|
---
|
|
27
67
|
|
package/dist/adapters/http.d.ts
CHANGED
|
@@ -1,4 +1,7 @@
|
|
|
1
|
-
import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-
|
|
1
|
+
import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-c2R2kfmv.js';
|
|
2
|
+
import '../run-record-BgTFzO2r.js';
|
|
3
|
+
import '../errors-Dwqw-T_m.js';
|
|
4
|
+
import '../schema-m0gsnbt3.js';
|
|
2
5
|
|
|
3
6
|
/**
|
|
4
7
|
* # `@tangle-network/agent-eval/adapters/http` — distributed Dispatch over HTTP.
|
|
@@ -1,4 +1,7 @@
|
|
|
1
|
-
import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-
|
|
1
|
+
import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-c2R2kfmv.js';
|
|
2
|
+
import '../run-record-BgTFzO2r.js';
|
|
3
|
+
import '../errors-Dwqw-T_m.js';
|
|
4
|
+
import '../schema-m0gsnbt3.js';
|
|
2
5
|
|
|
3
6
|
/**
|
|
4
7
|
* # `@tangle-network/agent-eval/adapters/langchain` — wrap any LangChain
|
package/dist/adapters/otel.d.ts
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { T as TraceSpanEvent, H as HostedClient } from '../index-
|
|
2
|
-
import '../types-
|
|
3
|
-
import '../
|
|
4
|
-
import '../run-record-DgUVo5pw.js';
|
|
1
|
+
import { T as TraceSpanEvent, H as HostedClient } from '../index-GISRh500.js';
|
|
2
|
+
import '../types-c2R2kfmv.js';
|
|
3
|
+
import '../run-record-BgTFzO2r.js';
|
|
5
4
|
import '../errors-Dwqw-T_m.js';
|
|
6
5
|
import '../schema-m0gsnbt3.js';
|
|
6
|
+
import '../summary-report-ByiOUrHj.js';
|
|
7
7
|
import '../failure-cluster-CL7IVgkJ.js';
|
|
8
8
|
import '../store-CKUAgsJz.js';
|
|
9
9
|
import '../judge-calibration-DilmB3Ml.js';
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-
|
|
2
|
-
import '../run-record-
|
|
1
|
+
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-DsnOpCO6.js';
|
|
2
|
+
import '../run-record-BgTFzO2r.js';
|
|
3
3
|
import '../errors-Dwqw-T_m.js';
|
|
4
4
|
import '../schema-m0gsnbt3.js';
|
package/dist/campaign/index.d.ts
CHANGED
|
@@ -1,20 +1,167 @@
|
|
|
1
|
-
import { C as CampaignStorage } from '../provenance-
|
|
2
|
-
export { B as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions,
|
|
3
|
-
import { L as
|
|
4
|
-
|
|
5
|
-
|
|
1
|
+
import { a as RunCampaignOptions, C as CampaignStorage } from '../provenance-cUnovpWV.js';
|
|
2
|
+
export { B as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, m as EmitLoopProvenanceArgs, n as EmitLoopProvenanceResult, E as EvolutionaryDriverOptions, o as GepaDriverConstraints, G as GepaDriverOptions, H as HeldOutGateOptions, p as LoopProvenanceBackend, q as LoopProvenanceCandidate, L as LoopProvenanceRecord, O as OpenAutoPrOptions, s as OpenAutoPrResult, b as RunEvalOptions, c as RunImprovementLoopOptions, R as RunImprovementLoopResult, t as RunOptimizationOptions, u as RunOptimizationResult, v as buildLoopProvenanceRecord, d as composeGate, w as countSentenceEdits, e as defaultProductionGate, x as defaultRenderDiff, y as emitLoopProvenance, f as evolutionaryDriver, z as extractH2Sections, g as fsCampaignStorage, h as gepaDriver, i as heldOutGate, j as inMemoryCampaignStorage, A as loopProvenanceSpans, F as openAutoPr, I as provenanceRecordPath, J as provenanceSpansPath, r as runCampaign, k as runEval, l as runImprovementLoop, K as runOptimization, M as surfaceContentHash, N as surfaceHash } from '../provenance-cUnovpWV.js';
|
|
3
|
+
import { L as LlmClientOptions } from '../llm-client-DbjLfz-K.js';
|
|
4
|
+
import { I as ImprovementDriver, L as LabeledScenarioStore, q as LabeledScenarioWrite, r as LabeledScenarioSampleArgs, s as LabeledScenarioRecord, t as LabelTrust, S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, u as LabeledScenarioSource, f as CampaignResult, h as CodeSurface } from '../types-c2R2kfmv.js';
|
|
5
|
+
export { C as CampaignAggregates, c as CampaignArtifactWriter, d as CampaignCellResult, e as CampaignCostMeter, v as CampaignTokenUsage, g as CampaignTraceWriter, D as DispatchFn, G as Gate, i as GateContext, j as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, w as JudgeAggregate, n as JudgeDimension, J as JudgeScore, o as Mutator, O as OptimizerConfig, P as ParetoParent, x as ProposeContext, y as ProposedCandidate, R as RedactionStatus, z as ScenarioAggregate, p as SessionScript, T as TraceSpan, A as isProposedCandidate, B as labelTrustRank } from '../types-c2R2kfmv.js';
|
|
6
|
+
import { A as AgentProfile, B as BackendIntegrityReport } from '../agent-profile-DzcPHR1Z.js';
|
|
6
7
|
import { A as AgentEvalError } from '../errors-Dwqw-T_m.js';
|
|
7
|
-
import {
|
|
8
|
-
import '../llm-client-DbjLfz-K.js';
|
|
9
|
-
import '../raw-provider-sink-C46HDghv.js';
|
|
8
|
+
import { b as RunSplitTag, R as RunRecord } from '../run-record-BgTFzO2r.js';
|
|
10
9
|
import '../red-team-DW9Ca_tj.js';
|
|
11
10
|
import '../dataset-B2kL-fSM.js';
|
|
12
11
|
import '../store-CKUAgsJz.js';
|
|
13
12
|
import '../schema-m0gsnbt3.js';
|
|
14
|
-
import '../index-
|
|
15
|
-
import '../summary-report-
|
|
13
|
+
import '../index-GISRh500.js';
|
|
14
|
+
import '../summary-report-ByiOUrHj.js';
|
|
16
15
|
import '../failure-cluster-CL7IVgkJ.js';
|
|
17
16
|
import '../judge-calibration-DilmB3Ml.js';
|
|
17
|
+
import '../raw-provider-sink-C46HDghv.js';
|
|
18
|
+
|
|
19
|
+
/**
|
|
20
|
+
* @experimental
|
|
21
|
+
*
|
|
22
|
+
* SkillOpt patch primitives (Microsoft, arXiv:2605.23904 — "Executive
|
|
23
|
+
* Strategy for Self-Evolving Agent Skills"). Where GEPA regenerates a surface
|
|
24
|
+
* by reflection, SkillOpt emits BOUNDED, anchored edits to ONE skill document
|
|
25
|
+
* — add / delete / replace — and accepts an edit only if it strictly improves
|
|
26
|
+
* a held-out score. Bounded edits are the "textual learning rate": small,
|
|
27
|
+
* reversible, and cheap to accept/reject, so a good rule introduced earlier is
|
|
28
|
+
* not overwritten by a later sweeping rewrite.
|
|
29
|
+
*
|
|
30
|
+
* This module applies a patch deterministically and reports, per op, what
|
|
31
|
+
* applied and what could not (a missing anchor is a rejected op, never a
|
|
32
|
+
* silently dropped one). Pure, no I/O.
|
|
33
|
+
*/
|
|
34
|
+
/** A single bounded edit against a skill surface.
|
|
35
|
+
* - `add` — insert `text` after the first line containing `after`
|
|
36
|
+
* (append to the end when `after` is absent/empty).
|
|
37
|
+
* - `delete` — remove the first line containing `anchor`.
|
|
38
|
+
* - `replace` — replace the first line containing `anchor` with `text`.
|
|
39
|
+
* `text` may be multi-line; it is spliced in as multiple lines. Anchors match
|
|
40
|
+
* the FIRST line that contains the substring (deterministic; SkillOpt is
|
|
41
|
+
* expected to anchor on unique text). */
|
|
42
|
+
type SkillPatchOp = {
|
|
43
|
+
op: 'add';
|
|
44
|
+
after?: string;
|
|
45
|
+
text: string;
|
|
46
|
+
} | {
|
|
47
|
+
op: 'delete';
|
|
48
|
+
anchor: string;
|
|
49
|
+
} | {
|
|
50
|
+
op: 'replace';
|
|
51
|
+
anchor: string;
|
|
52
|
+
text: string;
|
|
53
|
+
};
|
|
54
|
+
/** A named, attributable bundle of ops the optimizer proposes as one edit. */
|
|
55
|
+
interface SkillPatch {
|
|
56
|
+
label: string;
|
|
57
|
+
rationale: string;
|
|
58
|
+
ops: SkillPatchOp[];
|
|
59
|
+
}
|
|
60
|
+
interface SkillPatchRejection {
|
|
61
|
+
op: SkillPatchOp;
|
|
62
|
+
reason: string;
|
|
63
|
+
}
|
|
64
|
+
interface ApplySkillPatchResult {
|
|
65
|
+
surface: string;
|
|
66
|
+
/** Count of ops that mutated the surface. */
|
|
67
|
+
applied: number;
|
|
68
|
+
/** Ops that could not apply (unanchored / empty), with the reason. The
|
|
69
|
+
* surface still reflects every APPLIED op — partial application is honest,
|
|
70
|
+
* and the caller decides whether a partial patch is worth scoring. */
|
|
71
|
+
rejected: SkillPatchRejection[];
|
|
72
|
+
}
|
|
73
|
+
/**
|
|
74
|
+
* Apply a SkillOpt patch to a text surface. Ops apply in array order against
|
|
75
|
+
* the evolving line buffer (an `add after X` followed by a `delete X` sees the
|
|
76
|
+
* inserted lines). A missing anchor rejects only that op; the rest still apply.
|
|
77
|
+
*/
|
|
78
|
+
declare function applySkillPatch(surface: string, patch: SkillPatch): ApplySkillPatchResult;
|
|
79
|
+
/** Total ops in a patch — the edit-budget axis (SkillOpt's "textual learning
|
|
80
|
+
* rate" caps this per epoch). */
|
|
81
|
+
declare function patchEditCount(patch: SkillPatch): number;
|
|
82
|
+
|
|
83
|
+
/**
|
|
84
|
+
* @experimental
|
|
85
|
+
*
|
|
86
|
+
* `skillOptDriver` — a patch-mode `ImprovementDriver` implementing SkillOpt
|
|
87
|
+
* (Microsoft, arXiv:2605.23904). Where `gepaDriver` regenerates the whole
|
|
88
|
+
* surface by reflection, SkillOpt proposes BOUNDED, anchored edits
|
|
89
|
+
* (add/delete/replace) to ONE skill document, so a good rule introduced
|
|
90
|
+
* earlier is not clobbered by a later sweeping rewrite. The edit budget is the
|
|
91
|
+
* paper's "textual learning rate"; a rejected-edit buffer + a slow-update
|
|
92
|
+
* meta-note steer the optimizer away from dead ends.
|
|
93
|
+
*
|
|
94
|
+
* This module is the PROPOSER — the LLM call that turns evidence into
|
|
95
|
+
* structured patches. The accept-only-if-held-out-improves loop, the budget
|
|
96
|
+
* annealing, and the rejected buffer live in the `runSkillOpt` preset, which
|
|
97
|
+
* owns the epoch hill-climb. The driver also conforms to `ImprovementDriver`
|
|
98
|
+
* (`propose` applies its patches to the current surface and returns the
|
|
99
|
+
* candidate surfaces) so it is a drop-in for `runOptimization` and a fair
|
|
100
|
+
* entrant in `compareDrivers`.
|
|
101
|
+
*/
|
|
102
|
+
|
|
103
|
+
/** Evidence the optimizer reflects on: where the current surface is weakest.
|
|
104
|
+
* Computed by the caller (the preset uses a TRAIN campaign so proposals never
|
|
105
|
+
* see the held-out split; the generic loop derives it from history). */
|
|
106
|
+
interface SkillOptEvidence {
|
|
107
|
+
/** Lowest-scoring scenarios (drives WHICH behavior to patch). */
|
|
108
|
+
weakScenarios: Array<{
|
|
109
|
+
scenarioId: string;
|
|
110
|
+
composite: number;
|
|
111
|
+
}>;
|
|
112
|
+
/** Lowest-scoring judge dimensions (drives WHAT to patch for). */
|
|
113
|
+
weakDimensions: Array<{
|
|
114
|
+
dimension: string;
|
|
115
|
+
score: number;
|
|
116
|
+
}>;
|
|
117
|
+
}
|
|
118
|
+
/** A patch that was tried and not accepted — fed back to the model so it does
|
|
119
|
+
* not re-propose a dead end (SkillOpt's rejected-edit buffer). */
|
|
120
|
+
interface RejectedEdit {
|
|
121
|
+
label: string;
|
|
122
|
+
rationale: string;
|
|
123
|
+
reason: string;
|
|
124
|
+
}
|
|
125
|
+
interface ProposePatchesArgs {
|
|
126
|
+
surface: string;
|
|
127
|
+
evidence: SkillOptEvidence;
|
|
128
|
+
/** Max ops per patch this round (the annealed textual learning rate). */
|
|
129
|
+
editBudget: number;
|
|
130
|
+
rejectedBuffer: RejectedEdit[];
|
|
131
|
+
/** Slow-update meta guidance accumulated across epochs. */
|
|
132
|
+
metaNote?: string;
|
|
133
|
+
/** How many candidate patches to propose. */
|
|
134
|
+
count: number;
|
|
135
|
+
signal: AbortSignal;
|
|
136
|
+
}
|
|
137
|
+
interface SkillOptDriverOptions {
|
|
138
|
+
llm: LlmClientOptions;
|
|
139
|
+
model: string;
|
|
140
|
+
/** What the skill document governs — orients the prompt. */
|
|
141
|
+
target: string;
|
|
142
|
+
/** Default ops-per-patch cap when used as a bare `ImprovementDriver`. The
|
|
143
|
+
* `runSkillOpt` preset overrides this per epoch as it anneals. Default 3. */
|
|
144
|
+
editBudget?: number;
|
|
145
|
+
temperature?: number;
|
|
146
|
+
maxTokens?: number;
|
|
147
|
+
/** Top-K weak scenarios/dimensions surfaced as evidence. Default 3. */
|
|
148
|
+
evidenceK?: number;
|
|
149
|
+
}
|
|
150
|
+
interface SkillOptDriver extends ImprovementDriver {
|
|
151
|
+
/** Patch-native path used by `runSkillOpt` (the SkillOpt epoch loop owns
|
|
152
|
+
* acceptance/budget/buffer). Returns structured patches, NOT surfaces. */
|
|
153
|
+
proposePatches(args: ProposePatchesArgs): Promise<SkillPatch[]>;
|
|
154
|
+
}
|
|
155
|
+
declare function skillOptDriver(opts: SkillOptDriverOptions): SkillOptDriver;
|
|
156
|
+
/** Parse + validate the patch response. Throws `SkillPatchParseError` when the
|
|
157
|
+
* response is not valid JSON at all (a router/model failure the caller must
|
|
158
|
+
* see — never a silent no-op epoch). Returns `[]` only for the legitimate
|
|
159
|
+
* "valid JSON, zero usable patches" case. Malformed ops within a patch are
|
|
160
|
+
* dropped (not silently mutated); each patch is truncated to the edit budget. */
|
|
161
|
+
declare class SkillPatchParseError extends Error {
|
|
162
|
+
constructor(message: string);
|
|
163
|
+
}
|
|
164
|
+
declare function parseSkillPatchResponse(raw: string, maxPatches: number, editBudget: number): SkillPatch[];
|
|
18
165
|
|
|
19
166
|
/**
|
|
20
167
|
* @experimental
|
|
@@ -77,6 +224,114 @@ declare class FsLabeledScenarioStore implements LabeledScenarioStore {
|
|
|
77
224
|
private pathForSource;
|
|
78
225
|
}
|
|
79
226
|
|
|
227
|
+
/**
|
|
228
|
+
* @experimental
|
|
229
|
+
*
|
|
230
|
+
* `compareDrivers` — a head-to-head lift benchmark across optimizer drivers on
|
|
231
|
+
* ONE corpus. This is the forcing function: optimizer quality (GEPA reflection
|
|
232
|
+
* vs GEPA+Pareto vs SkillOpt) becomes a NUMBER with a confidence interval, so a
|
|
233
|
+
* driver regression — or shipping a simplified driver and calling it the real
|
|
234
|
+
* one — turns a build red instead of going measurement-invisible.
|
|
235
|
+
*
|
|
236
|
+
* Every entrant is scored the SAME way: each driver returns the surface it
|
|
237
|
+
* promoted, then `compareDrivers` scores the baseline + every winner on the
|
|
238
|
+
* SAME held-out scenarios with the SAME judges. Apples-to-apples by
|
|
239
|
+
* construction — the comparison never depends on how a driver measured itself.
|
|
240
|
+
* The per-scenario held-out composites feed a paired bootstrap (`statistics.ts`)
|
|
241
|
+
* for each driver's lift CI and for the pairwise "which driver wins" CI.
|
|
242
|
+
*/
|
|
243
|
+
|
|
244
|
+
/** What an optimizer produced: the surface it promoted + what it cost to get
|
|
245
|
+
* there. `compareDrivers` does the held-out scoring itself, so an entry only
|
|
246
|
+
* needs to run its loop and hand back the winner. */
|
|
247
|
+
interface DriverEntry {
|
|
248
|
+
name: string;
|
|
249
|
+
optimize: () => Promise<{
|
|
250
|
+
winnerSurface: MutableSurface;
|
|
251
|
+
costUsd: number;
|
|
252
|
+
durationMs?: number;
|
|
253
|
+
}>;
|
|
254
|
+
}
|
|
255
|
+
interface DriverScore {
|
|
256
|
+
name: string;
|
|
257
|
+
/** Mean held-out composite of the baseline (identical across drivers). */
|
|
258
|
+
baselineComposite: number;
|
|
259
|
+
/** Mean held-out composite of this driver's promoted surface. */
|
|
260
|
+
winnerComposite: number;
|
|
261
|
+
/** Mean per-scenario held-out lift (winner − baseline). */
|
|
262
|
+
lift: number;
|
|
263
|
+
/** Paired-bootstrap CI of the per-scenario lift. `low > 0` ⇒ a real gain. */
|
|
264
|
+
liftCi: {
|
|
265
|
+
low: number;
|
|
266
|
+
high: number;
|
|
267
|
+
};
|
|
268
|
+
costUsd: number;
|
|
269
|
+
durationMs?: number;
|
|
270
|
+
winnerSurface: MutableSurface;
|
|
271
|
+
/** 1-based, by descending lift. */
|
|
272
|
+
rank: number;
|
|
273
|
+
}
|
|
274
|
+
interface DriverPairwise {
|
|
275
|
+
/** Higher-ranked driver. */
|
|
276
|
+
a: string;
|
|
277
|
+
b: string;
|
|
278
|
+
/** Mean per-scenario held-out delta (a − b). */
|
|
279
|
+
deltaMean: number;
|
|
280
|
+
low: number;
|
|
281
|
+
high: number;
|
|
282
|
+
/** `a` if the CI lies entirely above 0, `b` if it lies entirely below 0, else `'tie'`. */
|
|
283
|
+
favored: string;
|
|
284
|
+
}
|
|
285
|
+
interface DriverComparison {
|
|
286
|
+
/** Sorted by descending lift; `rank` set accordingly. */
|
|
287
|
+
scores: DriverScore[];
|
|
288
|
+
best: DriverScore;
|
|
289
|
+
/** Best vs each other driver, paired-bootstrap on the held-out winners. */
|
|
290
|
+
pairwise: DriverPairwise[];
|
|
291
|
+
holdoutScenarioIds: string[];
|
|
292
|
+
}
|
|
293
|
+
interface CompareDriversOptions<TScenario extends Scenario, TArtifact> extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'dispatch' | 'scenarios'> {
|
|
294
|
+
drivers: DriverEntry[];
|
|
295
|
+
baselineSurface: MutableSurface;
|
|
296
|
+
/** The held-out scenarios every winner is scored on. */
|
|
297
|
+
holdoutScenarios: TScenario[];
|
|
298
|
+
/** Scores a surface on a scenario — the same dispatcher the drivers used. */
|
|
299
|
+
dispatchWithSurface: (surface: MutableSurface, scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>;
|
|
300
|
+
/** Bootstrap resamples for the lift CIs. Default 2000. */
|
|
301
|
+
resamples?: number;
|
|
302
|
+
/** CI confidence. Default 0.95. */
|
|
303
|
+
confidence?: number;
|
|
304
|
+
}
|
|
305
|
+
declare function compareDrivers<TScenario extends Scenario, TArtifact>(opts: CompareDriversOptions<TScenario, TArtifact>): Promise<DriverComparison>;
|
|
306
|
+
/** Shared corpus + transport for the three built-in optimizer entries. */
|
|
307
|
+
interface OptimizerEntryConfig<TScenario extends Scenario, TArtifact> {
|
|
308
|
+
baselineSurface: string;
|
|
309
|
+
/** Training scenarios the drivers reflect on. */
|
|
310
|
+
trainScenarios: TScenario[];
|
|
311
|
+
/** Held-out scenarios (the gate axis + the benchmark scoring axis). */
|
|
312
|
+
holdoutScenarios: TScenario[];
|
|
313
|
+
dispatchWithSurface: (surface: MutableSurface, scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>;
|
|
314
|
+
judges: JudgeConfig<TArtifact, TScenario>[];
|
|
315
|
+
llm: LlmClientOptions;
|
|
316
|
+
model: string;
|
|
317
|
+
target: string;
|
|
318
|
+
runDir: string;
|
|
319
|
+
seed?: number;
|
|
320
|
+
/** GEPA population per generation. Default 2. */
|
|
321
|
+
populationSize?: number;
|
|
322
|
+
/** GEPA generations. Default 3. */
|
|
323
|
+
maxGenerations?: number;
|
|
324
|
+
/** SkillOpt epochs. Default 6. */
|
|
325
|
+
maxEpochs?: number;
|
|
326
|
+
mutationPrimitives?: string[];
|
|
327
|
+
}
|
|
328
|
+
/** GEPA, reflection-only (single-parent, no Pareto combine). */
|
|
329
|
+
declare function gepaReflectionEntry<TScenario extends Scenario, TArtifact>(config: OptimizerEntryConfig<TScenario, TArtifact>, name?: string): DriverEntry;
|
|
330
|
+
/** GEPA with the Pareto frontier + combine-complementary-lessons. */
|
|
331
|
+
declare function gepaParetoEntry<TScenario extends Scenario, TArtifact>(config: OptimizerEntryConfig<TScenario, TArtifact>, name?: string): DriverEntry;
|
|
332
|
+
/** SkillOpt patch-mode hill-climb. */
|
|
333
|
+
declare function skillOptEntry<TScenario extends Scenario, TArtifact>(config: OptimizerEntryConfig<TScenario, TArtifact>, name?: string): DriverEntry;
|
|
334
|
+
|
|
80
335
|
/**
|
|
81
336
|
* @experimental
|
|
82
337
|
*
|
|
@@ -215,6 +470,128 @@ interface RunProfileMatrixResult<TArtifact, TScenario extends Scenario> {
|
|
|
215
470
|
}
|
|
216
471
|
declare function runProfileMatrix<TScenario extends Scenario, TArtifact>(opts: RunProfileMatrixOptions<TScenario, TArtifact>): Promise<RunProfileMatrixResult<TArtifact, TScenario>>;
|
|
217
472
|
|
|
473
|
+
/**
|
|
474
|
+
* @experimental
|
|
475
|
+
*
|
|
476
|
+
* `runSkillOpt` — the SkillOpt epoch hill-climb (Microsoft, arXiv:2605.23904).
|
|
477
|
+
* Unlike `runOptimization`'s population/promote-top-K search, SkillOpt is a
|
|
478
|
+
* sequential, held-out-gated hill-climb on ONE skill document:
|
|
479
|
+
*
|
|
480
|
+
* each epoch:
|
|
481
|
+
* 1. reflect on the CURRENT surface's weakest TRAIN scenarios/dimensions
|
|
482
|
+
* (never the held-out split — proposals must not see the acceptance axis)
|
|
483
|
+
* 2. propose ≤ `patchesPerEpoch` bounded patches (≤ `editBudget` ops each)
|
|
484
|
+
* 3. apply each; score the candidate on the HELD-OUT split
|
|
485
|
+
* 4. ACCEPT the first patch that STRICTLY improves the held-out composite;
|
|
486
|
+
* otherwise push it to the rejected-edit buffer (fed back so the model
|
|
487
|
+
* does not re-propose dead ends)
|
|
488
|
+
* 5. anneal the edit budget down after consecutive rejections (the
|
|
489
|
+
* "textual learning rate" decay); refresh the slow-update meta note
|
|
490
|
+
* 6. stop at `maxEpochs` or after `patience` epochs with no acceptance
|
|
491
|
+
*
|
|
492
|
+
* The accept-only-if-held-out-improves rule is the same discipline as
|
|
493
|
+
* `HeldOutGate`/`defaultProductionGate`, applied per edit instead of once at
|
|
494
|
+
* the end — which is why the held-out composite is monotonically
|
|
495
|
+
* non-decreasing and a regression can never ship. `runCampaign` is the
|
|
496
|
+
* measurement; `applySkillPatch` applies the edits; `skillOptDriver` proposes
|
|
497
|
+
* them.
|
|
498
|
+
*/
|
|
499
|
+
|
|
500
|
+
interface RunSkillOptOptions<TScenario extends Scenario, TArtifact> extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'dispatch' | 'scenarios'> {
|
|
501
|
+
/** The skill document being optimized. */
|
|
502
|
+
baselineSurface: string;
|
|
503
|
+
/** Dispatcher taking the CURRENT skill surface + scenario → artifact. */
|
|
504
|
+
dispatchWithSurface: (surface: string, scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>;
|
|
505
|
+
driver: SkillOptDriver;
|
|
506
|
+
/** Scenarios the optimizer reflects on for evidence. MUST be disjoint from
|
|
507
|
+
* `holdoutScenarios` — proposals never see the acceptance axis. */
|
|
508
|
+
trainScenarios: TScenario[];
|
|
509
|
+
/** Held-out scenarios. An edit is accepted ONLY if it strictly improves the
|
|
510
|
+
* mean composite here. */
|
|
511
|
+
holdoutScenarios: TScenario[];
|
|
512
|
+
maxEpochs: number;
|
|
513
|
+
/** Candidate patches proposed per epoch. Default 2. */
|
|
514
|
+
patchesPerEpoch?: number;
|
|
515
|
+
/** Initial ops-per-patch cap (the textual learning rate). Default 3. */
|
|
516
|
+
editBudget?: number;
|
|
517
|
+
/** Strict acceptance margin: accept iff the held-out composite improves by
|
|
518
|
+
* MORE than this. Default 0 (any strict improvement). */
|
|
519
|
+
minImprovement?: number;
|
|
520
|
+
/** Stop after this many consecutive epochs with no acceptance. Default =
|
|
521
|
+
* `maxEpochs` (never early-stop). */
|
|
522
|
+
patience?: number;
|
|
523
|
+
/** Shrink the edit budget by 1 after 2 consecutive rejected epochs (min 1).
|
|
524
|
+
* Default true. */
|
|
525
|
+
budgetAnneal?: boolean;
|
|
526
|
+
/** Cap on the rejected-edit buffer (most-recent kept). Default 12. */
|
|
527
|
+
rejectedBufferSize?: number;
|
|
528
|
+
/** Refresh the slow-update meta note every N epochs. Default 2. 0 disables. */
|
|
529
|
+
slowMetaEvery?: number;
|
|
530
|
+
/** Top-K weak scenarios/dimensions surfaced as evidence each epoch. Default 3. */
|
|
531
|
+
evidenceK?: number;
|
|
532
|
+
/** Abort signal forwarded to the patch-proposing LLM calls. */
|
|
533
|
+
signal?: AbortSignal;
|
|
534
|
+
}
|
|
535
|
+
interface AcceptedEdit {
|
|
536
|
+
epoch: number;
|
|
537
|
+
label: string;
|
|
538
|
+
rationale: string;
|
|
539
|
+
/** Held-out composite improvement vs the surface before this edit. */
|
|
540
|
+
holdoutDelta: number;
|
|
541
|
+
}
|
|
542
|
+
interface SkillOptEpochRecord {
|
|
543
|
+
epoch: number;
|
|
544
|
+
editBudget: number;
|
|
545
|
+
proposed: number;
|
|
546
|
+
/** The accepted edit this epoch, or null if every proposal was rejected. */
|
|
547
|
+
accepted: AcceptedEdit | null;
|
|
548
|
+
rejected: RejectedEdit[];
|
|
549
|
+
/** Held-out composite of the CURRENT surface at the END of the epoch. */
|
|
550
|
+
holdoutComposite: number;
|
|
551
|
+
}
|
|
552
|
+
interface RunSkillOptResult {
|
|
553
|
+
winnerSurface: string;
|
|
554
|
+
baselineHoldoutComposite: number;
|
|
555
|
+
winnerHoldoutComposite: number;
|
|
556
|
+
/** `winnerHoldoutComposite - baselineHoldoutComposite` — always ≥ 0
|
|
557
|
+
* by construction (only strictly-improving edits are accepted). */
|
|
558
|
+
lift: number;
|
|
559
|
+
acceptedEdits: AcceptedEdit[];
|
|
560
|
+
rejectedEdits: RejectedEdit[];
|
|
561
|
+
epochsRun: number;
|
|
562
|
+
history: SkillOptEpochRecord[];
|
|
563
|
+
/** Total cost across every scoring campaign (train evidence + holdout
|
|
564
|
+
* acceptance) the hill-climb ran. */
|
|
565
|
+
totalCostUsd: number;
|
|
566
|
+
}
|
|
567
|
+
declare function runSkillOpt<TScenario extends Scenario, TArtifact>(opts: RunSkillOptOptions<TScenario, TArtifact>): Promise<RunSkillOptResult>;
|
|
568
|
+
|
|
569
|
+
/**
|
|
570
|
+
* @experimental
|
|
571
|
+
*
|
|
572
|
+
* Shared campaign-score reductions used by every optimizer preset
|
|
573
|
+
* (`runOptimization`, `runSkillOpt`, `compareDrivers`). ONE definition of
|
|
574
|
+
* "composite of a campaign" and "per-scenario / per-dimension breakdown" so
|
|
575
|
+
* the optimizers cannot drift on how a surface's score is computed.
|
|
576
|
+
*/
|
|
577
|
+
|
|
578
|
+
/** Mean composite across a campaign: per cell, the mean of its judges'
|
|
579
|
+
* composites; then the mean across cells. Cells with no judge scores are
|
|
580
|
+
* skipped. Empty ⇒ 0. */
|
|
581
|
+
declare function campaignMeanComposite<TArtifact, TScenario extends Scenario>(campaign: CampaignResult<TArtifact, TScenario>): number;
|
|
582
|
+
interface CampaignBreakdown {
|
|
583
|
+
/** Mean score per judge dimension across all cells. */
|
|
584
|
+
dimensions: Record<string, number>;
|
|
585
|
+
/** Per-scenario composite (mean over reps + judges). */
|
|
586
|
+
scenarios: Array<{
|
|
587
|
+
scenarioId: string;
|
|
588
|
+
composite: number;
|
|
589
|
+
}>;
|
|
590
|
+
}
|
|
591
|
+
/** Per-candidate evidence a reflective/patch driver grounds its next proposal
|
|
592
|
+
* on: mean score per judge dimension + per-scenario composite. */
|
|
593
|
+
declare function campaignBreakdown<TArtifact, TScenario extends Scenario>(campaign: CampaignResult<TArtifact, TScenario>): CampaignBreakdown;
|
|
594
|
+
|
|
218
595
|
/**
|
|
219
596
|
* @experimental
|
|
220
597
|
*
|
|
@@ -270,4 +647,4 @@ declare function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAd
|
|
|
270
647
|
* as a ref under the adapter's worktree dir. */
|
|
271
648
|
declare function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string;
|
|
272
649
|
|
|
273
|
-
export { CampaignResult, CampaignStorage, CodeSurface, DispatchContext, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type GitWorktreeAdapterOptions, JudgeConfig, LabelTrust, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioSource, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, type ProfileDispatchFn, ProfileMatrixError, type ProfileSummary, type RunProfileMatrixOptions, type RunProfileMatrixResult, Scenario, type ScenarioRollup, type Worktree, type WorktreeAdapter, WorktreeAdapterError, gitWorktreeAdapter, resolveWorktreePath, runProfileMatrix };
|
|
650
|
+
export { type AcceptedEdit, type ApplySkillPatchResult, type CampaignBreakdown, CampaignResult, CampaignStorage, CodeSurface, type CompareDriversOptions, DispatchContext, type DriverComparison, type DriverEntry, type DriverPairwise, type DriverScore, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type GitWorktreeAdapterOptions, ImprovementDriver, JudgeConfig, LabelTrust, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioSource, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, MutableSurface, type OptimizerEntryConfig, type ProfileDispatchFn, ProfileMatrixError, type ProfileSummary, type ProposePatchesArgs, type RejectedEdit, RunCampaignOptions, type RunProfileMatrixOptions, type RunProfileMatrixResult, type RunSkillOptOptions, type RunSkillOptResult, Scenario, type ScenarioRollup, type SkillOptDriver, type SkillOptDriverOptions, type SkillOptEpochRecord, type SkillOptEvidence, type SkillPatch, type SkillPatchOp, SkillPatchParseError, type SkillPatchRejection, type Worktree, type WorktreeAdapter, WorktreeAdapterError, applySkillPatch, campaignBreakdown, campaignMeanComposite, compareDrivers, gepaParetoEntry, gepaReflectionEntry, gitWorktreeAdapter, parseSkillPatchResponse, patchEditCount, resolveWorktreePath, runProfileMatrix, runSkillOpt, skillOptDriver, skillOptEntry };
|