@tangle-network/agent-eval 0.51.0 → 0.53.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +54 -1
- package/dist/adapters/otel.d.ts +1 -1
- package/dist/campaign/index.d.ts +7 -66
- package/dist/campaign/index.js +5 -122
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-XAP6DJZE.js → chunk-YXD7GWJI.js} +35 -2
- package/dist/chunk-YXD7GWJI.js.map +1 -0
- package/dist/contract/index.d.ts +16 -4
- package/dist/contract/index.js +147 -1
- package/dist/contract/index.js.map +1 -1
- package/dist/hosted/index.d.ts +1 -1
- package/dist/{index-DQHtWQ57.d.ts → index-C7RhhEME.d.ts} +46 -0
- package/dist/openapi.json +1 -1
- package/dist/{run-improvement-loop-BPMjNKMJ.d.ts → run-improvement-loop-Cc7oZlRP.d.ts} +48 -15
- package/docs/design/self-improvement-protocol.md +223 -0
- package/docs/specs/driver-honest-spec.md +251 -0
- package/docs/specs/hermes-self-improvement-audit.md +93 -0
- package/docs/specs/profile-versioning.md +291 -0
- package/package.json +1 -1
- package/dist/chunk-XAP6DJZE.js.map +0 -1
|
@@ -79,25 +79,48 @@ declare function evolutionaryDriver<TFindings = unknown>(opts: EvolutionaryDrive
|
|
|
79
79
|
*
|
|
80
80
|
* `gepaDriver` — a reflective `ImprovementDriver` for prompt-tier surfaces.
|
|
81
81
|
* Each generation it reflects on the prior best candidate's per-scenario
|
|
82
|
-
* scores + weakest dimensions
|
|
83
|
-
*
|
|
84
|
-
* surface, and returns them as the next population.
|
|
82
|
+
* scores + weakest dimensions, asks an LLM to propose targeted rewrites of
|
|
83
|
+
* the current surface, and returns them as the next population.
|
|
85
84
|
*
|
|
86
|
-
*
|
|
87
|
-
*
|
|
88
|
-
*
|
|
89
|
-
*
|
|
90
|
-
*
|
|
91
|
-
*
|
|
85
|
+
* Honest scope vs the GEPA paper (Agrawal et al., arXiv:2507.19457):
|
|
86
|
+
* this driver implements the *reflection* primitive — it does NOT implement
|
|
87
|
+
* GEPA's Pareto frontier of candidates, multi-objective non-dominated
|
|
88
|
+
* tracking, or the combine-complementary-lessons step. We use "best by
|
|
89
|
+
* composite" as the parent each generation; the paper retains a Pareto set
|
|
90
|
+
* and combines lessons across non-dominated candidates. Tracked as #101 in
|
|
91
|
+
* the substrate roadmap. See `docs/specs/driver-honest-spec.md`.
|
|
92
92
|
*
|
|
93
|
-
*
|
|
93
|
+
* Optional `constraints` move structured-doc guards into the driver
|
|
94
|
+
* (preserve H2 section headings, cap sentence-level edits) — useful when
|
|
95
|
+
* the surface IS a structured procedure like a SKILL.md / runbook /
|
|
96
|
+
* judge rubric. When `constraints` is omitted, behavior is unchanged.
|
|
97
|
+
*
|
|
98
|
+
* The driver is surface-agnostic — any string surface in any consumer opts
|
|
99
|
+
* in by selecting it. Reuses the generic reflection primitive
|
|
100
|
+
* (`buildReflectionPrompt` / `parseReflectionResponse`) and the router
|
|
101
|
+
* client; no dependency on the legacy `runMultiShotOptimization` /
|
|
102
|
+
* `prompt-evolution` orchestration.
|
|
103
|
+
*
|
|
104
|
+
* Earns its keep where there is real per-instance signal (which the
|
|
94
105
|
* dimensional + per-scenario evidence + the `LabeledScenarioStore` flywheel
|
|
95
|
-
* now provide). For thin-signal surfaces it degrades to plain reflection
|
|
96
|
-
*
|
|
97
|
-
*
|
|
98
|
-
* alone.
|
|
106
|
+
* now provide). For thin-signal surfaces it degrades to plain reflection.
|
|
107
|
+
* On generation 0 (no history) it reflects on the current surface against
|
|
108
|
+
* the mutation primitives alone.
|
|
99
109
|
*/
|
|
100
110
|
|
|
111
|
+
interface GepaDriverConstraints {
|
|
112
|
+
/** H2 section headings that MUST appear unchanged in every candidate.
|
|
113
|
+
* When set (an empty array means: auto-detect the surface's current H2s), the driver
|
|
114
|
+
* rejects any candidate that drops or renames a preserved heading.
|
|
115
|
+
* Use when the surface is a structured doc (SKILL.md, runbook,
|
|
116
|
+
* sectioned system prompt, judge rubric). */
|
|
117
|
+
preserveSections?: string[];
|
|
118
|
+
/** Maximum sentence-level edits per candidate vs the parent surface.
|
|
119
|
+
* Rejection threshold = maxSentenceEdits × 2 (counts adds + removes).
|
|
120
|
+
* Inspired by SkillOpt's edit-budget as a "textual learning rate."
|
|
121
|
+
* Cap prevents an LLM rewrite from overwriting useful prior rules. */
|
|
122
|
+
maxSentenceEdits?: number;
|
|
123
|
+
}
|
|
101
124
|
interface GepaDriverOptions {
|
|
102
125
|
/** Router transport (apiKey/baseUrl). */
|
|
103
126
|
llm: LlmClientOptions;
|
|
@@ -113,8 +136,18 @@ interface GepaDriverOptions {
|
|
|
113
136
|
temperature?: number;
|
|
114
137
|
/** Reflection max tokens. Default 6000. */
|
|
115
138
|
maxTokens?: number;
|
|
139
|
+
/** Structured-doc constraints. Candidates violating any are rejected
|
|
140
|
+
* post-parse and dropped from the returned population. */
|
|
141
|
+
constraints?: GepaDriverConstraints;
|
|
116
142
|
}
|
|
117
143
|
declare function gepaDriver(opts: GepaDriverOptions): ImprovementDriver;
|
|
144
|
+
/** Extract H2 headings (`## Foo`) from a markdown surface. Exported for
|
|
145
|
+
* consumers building custom mutators that share the same invariant. */
|
|
146
|
+
declare function extractH2Sections(text: string): string[];
|
|
147
|
+
/** Sentence-level edit distance — count distinct add/remove ops between
|
|
148
|
+
* two surfaces via a normalised line-by-line set diff. Treats trivial
|
|
149
|
+
* whitespace as identical. Exported for tests + consumer-side validators. */
|
|
150
|
+
declare function countSentenceEdits(baseline: string, candidate: string): number;
|
|
118
151
|
|
|
119
152
|
/**
|
|
120
153
|
* @experimental
|
|
@@ -414,4 +447,4 @@ interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extend
|
|
|
414
447
|
}
|
|
415
448
|
declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>;
|
|
416
449
|
|
|
417
|
-
export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type
|
|
450
|
+
export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverConstraints as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type GepaDriverOptions as a, type OpenAutoPrResult as b, type RunEvalOptions as c, type RunImprovementLoopOptions as d, type RunImprovementLoopResult as e, type RunOptimizationOptions as f, type RunOptimizationResult as g, composeGate as h, countSentenceEdits as i, defaultProductionGate as j, evolutionaryDriver as k, extractH2Sections as l, fsCampaignStorage as m, gepaDriver as n, heldOutGate as o, inMemoryCampaignStorage as p, openAutoPr as q, runCampaign as r, runEval as s, runImprovementLoop as t, runOptimization as u, surfaceHash as v };
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
# Self-improvement protocol — the world-class architecture
|
|
2
|
+
|
|
3
|
+
**Status:** Strategic design. The artifact that every roadmap entry maps to.
|
|
4
|
+
**Date:** 2026-05-27.
|
|
5
|
+
|
|
6
|
+
## Thesis
|
|
7
|
+
|
|
8
|
+
**Self-improvement is a protocol, not a product.** We define the wire formats, surface abstractions, driver interface, gate interface, and insight format. We ship reference implementations. Customers plug in whatever framework, model, or runtime they already use — our infrastructure handles the rigorous middle (analysis, gating, version-safe deployment).
|
|
9
|
+
|
|
10
|
+
No competitor ships this combination. LangSmith / Braintrust / Phoenix / LangFuse ship tracing. Hermes ships an agent. SkillOpt ships an academic optimizer. Anthropic's Claude Code ships skill-creation. **Nobody ships the protocol.**
|
|
11
|
+
|
|
12
|
+
## The pipeline as a single abstract flow
|
|
13
|
+
|
|
14
|
+
```
|
|
15
|
+
┌──────────────────────────────────────────────────────────────────────┐
|
|
16
|
+
│ WHATEVER YOU ALREADY USE │
|
|
17
|
+
│ LangChain · LlamaIndex · Anthropic SDK · OpenAI Assistants · │
|
|
18
|
+
│ Hermes · Claude Code · Codex · agent-runtime · your own stack │
|
|
19
|
+
└─────────────────────────────────┬────────────────────────────────────┘
|
|
20
|
+
│ traces (any format)
|
|
21
|
+
▼
|
|
22
|
+
┌──────────────────────────────────────────────────────────────────────┐
|
|
23
|
+
│ INGEST — universal trace adapters │
|
|
24
|
+
│ fromOtelSpans · fromFeedbackTable · fromLangChain · fromLlamaIndex ·│
|
|
25
|
+
│ fromAnthropicSDK · fromOpenAISDK · fromHermesProfileLog · BYO │
|
|
26
|
+
│ → canonical RunRecord[] │
|
|
27
|
+
└─────────────────────────────────┬────────────────────────────────────┘
|
|
28
|
+
▼
|
|
29
|
+
┌──────────────────────────────────────────────────────────────────────┐
|
|
30
|
+
│ ANALYZE — analyzeRuns({ runs, baselineRuns?, userFeedback? }) │
|
|
31
|
+
│ paired-bootstrap CI · Pareto · failure clusters · prior-period │
|
|
32
|
+
│ delta · user-corrective-signal extraction · recommendations │
|
|
33
|
+
│ ← THE STATISTICAL EDGE NOBODY ELSE SHIPS │
|
|
34
|
+
└─────────────────────────────────┬────────────────────────────────────┘
|
|
35
|
+
▼
|
|
36
|
+
┌──────────────────────────────────────────────────────────────────────┐
|
|
37
|
+
│ IMPROVE — selfImprove() closed loop │
|
|
38
|
+
│ gepaDriver · evolutionaryDriver · BYO ImprovementDriver │
|
|
39
|
+
│ → ProfileDiff (versioned, hashed, content-addressable) │
|
|
40
|
+
└─────────────────────────────────┬────────────────────────────────────┘
|
|
41
|
+
▼
|
|
42
|
+
┌──────────────────────────────────────────────────────────────────────┐
|
|
43
|
+
│ GATE — defaultProductionGate (paired-CI) · BYO gate │
|
|
44
|
+
│ ship-substrate / ship-harness / merge / inconclusive │
|
|
45
|
+
│ ← STATISTICALLY STRICTER THAN ANY COMPETITOR │
|
|
46
|
+
└─────────────────────────────────┬────────────────────────────────────┘
|
|
47
|
+
▼
|
|
48
|
+
┌──────────────────────────────────────────────────────────────────────┐
|
|
49
|
+
│ DEPLOY — back into WHATEVER YOU ALREADY USE │
|
|
50
|
+
│ agent-runtime · Hermes profile log · LangChain config · custom hook │
|
|
51
|
+
└──────────────────────────────────────────────────────────────────────┘
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## The integration promise
|
|
55
|
+
|
|
56
|
+
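Customers pick one of three integration shapes. Shapes A and B work today with the shipped adapters (`fromOtelSpans`, `fromFeedbackTable`); framework-specific adapter coverage and the hosted tier behind Shape C are still in progress (see "Where we are honest about gaps" below). Every shape uses the same canonical types underneath.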
|
|
57
|
+
|
|
58
|
+
### Shape A — offline analysis only
|
|
59
|
+
|
|
60
|
+
You have traces, you want a decision packet. Zero LLM cost. Zero closed loop.
|
|
61
|
+
|
|
62
|
+
```typescript
|
|
63
|
+
import { fromOtelSpans, analyzeRuns } from '@tangle-network/agent-eval'
|
|
64
|
+
|
|
65
|
+
const runs = fromOtelSpans({ spans: mySpans })
|
|
66
|
+
const report = await analyzeRuns({ runs })
|
|
67
|
+
// → InsightReport with composite, recommendations, Pareto, ...
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Use case: dashboards, weekly post-mortems, "did anything regress" checks. The intelligence-kernel ships this.
|
|
71
|
+
|
|
72
|
+
### Shape B — closed loop, your runtime
|
|
73
|
+
|
|
74
|
+
You have an agent, you want to improve it. We provide drivers + gate + insight. You decide when to deploy.
|
|
75
|
+
|
|
76
|
+
```typescript
|
|
77
|
+
import { selfImprove, gepaDriver } from '@tangle-network/agent-eval'
|
|
78
|
+
|
|
79
|
+
const result = await selfImprove({
|
|
80
|
+
scenarios,
|
|
81
|
+
agent: yourAgent, // any function (surface, scenario) → artifact
|
|
82
|
+
judge: yourJudge, // any function (artifact) → JudgeScore
|
|
83
|
+
baselineSurface,
|
|
84
|
+
driver: gepaDriver({ llm, model, target }),
|
|
85
|
+
budget: { generations: 3, populationSize: 4, holdoutFraction: 0.3 },
|
|
86
|
+
})
|
|
87
|
+
// → SelfImproveResult { baselineHash, diff, winningHash, lift, gateDecision, insight }
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Use case: every product agent we ship. Hermes-on-our-sandbox. Claude Code with skills. Anyone wanting "ship if statistically better, else hold."
|
|
91
|
+
|
|
92
|
+
### Shape C — hosted, cross-language
|
|
93
|
+
|
|
94
|
+
You stream traces from anywhere and get back InsightReports + selfImprove orchestration. Billing is usage-based.
|
|
95
|
+
|
|
96
|
+
```sh
|
|
97
|
+
# Stream traces
|
|
98
|
+
curl https://api.tangle.tools/v1/ingest/otel \
|
|
99
|
+
-H "Authorization: Bearer ${TANGLE_KEY}" \
|
|
100
|
+
--data-binary @traces.jsonl
|
|
101
|
+
|
|
102
|
+
# Get the decision packet
|
|
103
|
+
curl https://api.tangle.tools/v1/insight/${runId}
|
|
104
|
+
|
|
105
|
+
# Or run a closed-loop campaign
|
|
106
|
+
curl https://api.tangle.tools/v1/improve \
|
|
107
|
+
-d '{"scenarios": ..., "baselineHash": "...", "budget": {...}}'
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Use case: Python customers, Go customers, customers behind firewalls, customers who don't want to operate the substrate.
|
|
111
|
+
|
|
112
|
+
## The five non-negotiables
|
|
113
|
+
|
|
114
|
+
The protocol claim only holds if all five of these survive integration. Customers shouldn't have to compromise on any.
|
|
115
|
+
|
|
116
|
+
1. **Universal ingest.** Any trace format → canonical RunRecord. Coverage: OTel ✓, multi-rater feedback ✓, LangChain ⏳, LlamaIndex ⏳, Anthropic SDK ⏳, OpenAI Assistants ⏳, Hermes profile log ⏳.
|
|
117
|
+
2. **Statistical rigor.** Every claim falsifiable. Paired bootstrap CI on lift, Cohen's d on effect size, MDE-aware sample-size recommendations, p-values. **SkillOpt's gate is literal `cand > current`. Hermes has no gate. Ours has all of the above.** This is the moat.
|
|
118
|
+
3. **Plug-in everything.** Driver, judge, gate, intake adapter, storage all swappable. Customer brings their LLM, their judge, their scenarios. We bring the rigor.
|
|
119
|
+
4. **Version-safe deployment.** AgentProfile is content-addressable. Two writers (harness + substrate) can both mutate without lost-update. Gate verdicts are scoped to baseline hash, not absolute. Tracked as #98.
|
|
120
|
+
5. **Cross-language wire format.** Python client at parity with TypeScript. Hosted ingest spec versioned. Customers in any language consume the same shape.
|
|
121
|
+
|
|
122
|
+
## Where we are honest about gaps
|
|
123
|
+
|
|
124
|
+
| Component | Status | Customer impact when missing |
|
|
125
|
+
|---|---|---|
|
|
126
|
+
| `fromOtelSpans` ingest adapter | ✓ shipped 0.50.0 | — |
|
|
127
|
+
| `fromFeedbackTable` multi-rater intake | ✓ shipped 0.50.0 | — |
|
|
128
|
+
| `analyzeRuns` decision packet | ✓ shipped 0.50.0 / 0.50.2 actionability | — |
|
|
129
|
+
| `selfImprove` closed loop | ✓ shipped 0.50.0 | — |
|
|
130
|
+
| Paired-bootstrap gate | ✓ shipped early; still our edge | — |
|
|
131
|
+
| `gepaDriver` reflection (not full Pareto — task #101) | ⚠ partial | OK; customers don't need Pareto until they hit a plateau |
|
|
132
|
+
| **Prior-period comparison** in `analyzeRuns` | ✗ MISSING | "Did my last change help?" — the #1 customer question — has no rigorous answer today |
|
|
133
|
+
| **User-corrective-feedback signal extraction** | ✗ MISSING | Hermes' first-class skill signal. We have the trace data. We don't mine it. |
|
|
134
|
+
| **`init` CLI** scaffolding canonical eval/ layout | ✗ MISSING | Every new consumer wires it by hand; the skill describes 80 lines they have to copy |
|
|
135
|
+
| **Framework-specific intake adapters** (LangChain, LlamaIndex, Anthropic SDK, OpenAI Assistants) | ✗ MISSING | Customers using these frameworks can't ingest without writing custom adapter code |
|
|
136
|
+
| **Profile versioning** (task #98) | ✗ MISSING | Offline/online drift; gate verdicts can be stale by the time they're applied |
|
|
137
|
+
| **Composite driver** (optimize all surfaces against one gate) | ✗ MISSING | Customers can optimize prompts OR skills, not both jointly |
|
|
138
|
+
| **Empirical proof drivers work** | ✗ MISSING | We've never published "we ran gepaDriver on real customer data, here's the lift CI" |
|
|
139
|
+
| Hosted-tier production launch | ⚠ in scaffolding (intelligence-kernel) | Customers must self-host today |
|
|
140
|
+
|
|
141
|
+
## The roadmap — what closes each gap
|
|
142
|
+
|
|
143
|
+
Mapping every roadmap entry back to a concrete protocol gap.
|
|
144
|
+
|
|
145
|
+
### 0.53.0 (this session-or-next) — answer "did my last change help?"
|
|
146
|
+
|
|
147
|
+
- **`analyzeRuns({ runs, baselineRuns? })`** — when `baselineRuns` is provided, the report includes a `priorPeriodComparison?` block: per-metric delta with paired-bootstrap CI, MDE-aware significance judgment, "regressed metrics" surfaced in `recommendations`.
|
|
148
|
+
- Built on top of existing `diffRuns()` primitive (already shipped 0.48.0).
|
|
149
|
+
- 1 PR. Pure additive surface.
|
|
150
|
+
- **Customer impact**: this is the conversion question for every prospect.
|
|
151
|
+
|
|
152
|
+
### 0.54.0 — extract Hermes' missing signal
|
|
153
|
+
|
|
154
|
+
- **`extractUserCorrections(runs)`** — new substrate primitive. Mines user messages in traces for corrective markers (regex pass + LLM classifier for nuance). Returns `UserCorrectionEvent[]` keyed by runId.
|
|
155
|
+
- `analyzeRuns({ runs, userFeedback? })` includes a "common corrections" cluster in `recommendations`.
|
|
156
|
+
- Bridge to Hermes-style signal without adopting Hermes' runtime.
|
|
157
|
+
- **Customer impact**: distinctive — no competitor mines this signal.
|
|
158
|
+
|
|
159
|
+
### 0.55.0 — framework-specific intake adapters
|
|
160
|
+
|
|
161
|
+
- **`fromLangChain(traces)`**, **`fromLlamaIndex(traces)`**, **`fromAnthropicSDK(traces)`**, **`fromOpenAIAssistants(traces)`**.
|
|
162
|
+
- Each maps the framework's native trace shape to RunRecord.
|
|
163
|
+
- Top 4 frameworks = 80% of agent-builder market coverage.
|
|
164
|
+
- **Customer impact**: removes "we don't support your framework" friction.
|
|
165
|
+
|
|
166
|
+
### 0.56.0 — `init` CLI + worked examples
|
|
167
|
+
|
|
168
|
+
- `pnpm dlx @tangle-network/agent-eval init` scaffolds the canonical `eval/scenarios.json` + 3 pnpm scripts + judges template + `.runs/` directory.
|
|
169
|
+
- Adds 5+ end-to-end runnable examples covering Shapes A/B/C across the 4 framework adapters.
|
|
170
|
+
- **Customer impact**: time-to-first-eval drops from 4 hours to 5 minutes.
|
|
171
|
+
|
|
172
|
+
### 1.0.0 — profile versioning (#98) + composite driver
|
|
173
|
+
|
|
174
|
+
- Content-addressable `AgentProfileVersion` + `ProfileDiff` + 3-way merge + 4-way `DriftGateDecision`.
|
|
175
|
+
- `compositeDriver` — optimize all surfaces of one AgentProfile against one gate.
|
|
176
|
+
- Hermes-on-sandbox forcing function validates the work before commit.
|
|
177
|
+
- **Customer impact**: production-safe; the moat is locked.
|
|
178
|
+
|
|
179
|
+
### 1.1.0 — empirical-proof publication
|
|
180
|
+
|
|
181
|
+
- Pick one named customer or one synthetic-realistic corpus (legal-agent canonical).
|
|
182
|
+
- Run gepaDriver end-to-end with real LLM cost.
|
|
183
|
+
- Publish: "n=, lift=, CI=, p=, $cost=, vs no-driver baseline."
|
|
184
|
+
- One blog post, one demo video, one runnable repro.
|
|
185
|
+
- **Customer impact**: every other claim becomes credible because this one is verified.
|
|
186
|
+
|
|
187
|
+
## Why this design is 100x
|
|
188
|
+
|
|
189
|
+
Not a 10% improvement over LangSmith. A category change.
|
|
190
|
+
|
|
191
|
+
| Capability | LangSmith / Braintrust / Phoenix | Hermes / Claude Code | Tangle (target) |
|
|
192
|
+
|---|---|---|---|
|
|
193
|
+
| Trace ingest | ✓ proprietary | ✓ own runtime | ✓ universal |
|
|
194
|
+
| Decision packet | ⚠ scorecards (no CI) | ✗ | ✓ paired-bootstrap |
|
|
195
|
+
| Closed loop | ✗ | ✓ heuristic | ✓ statistically rigorous |
|
|
196
|
+
| Plug-in drivers | ✗ | ✗ | ✓ |
|
|
197
|
+
| Profile versioning | ✗ | ✗ | ✓ (1.0.0) |
|
|
198
|
+
| Composite multi-surface | ✗ | ✗ | ✓ (1.0.0) |
|
|
199
|
+
| Cross-language | ✗ | ✗ | ✓ (Python at parity) |
|
|
200
|
+
| Empirical-proof publication | ✗ | ✗ | ✓ (1.1.0) |
|
|
201
|
+
|
|
202
|
+
Eight rows. Nobody else has eight. We can be the only one. The work is named, scoped, and queued.
|
|
203
|
+
|
|
204
|
+
## What's NOT on the roadmap (and why)
|
|
205
|
+
|
|
206
|
+
- **Building our own agent runtime.** Hermes / agent-runtime / Claude Code cover that. We are infrastructure, not a runtime.
|
|
207
|
+
- **Single-vendor LLM.** Substrate stays model-agnostic.
|
|
208
|
+
- **UI-first product.** API-first. UIs are downstream.
|
|
209
|
+
- **LangChain replacement.** Wrong layer.
|
|
210
|
+
- **"Self-improvement" without a held-out gate.** Hermes and SkillOpt both ship this; we explicitly refuse — every selfImprove() requires a holdout.
|
|
211
|
+
|
|
212
|
+
## Decision log — what we committed to in 0.52.0 → 1.1.0
|
|
213
|
+
|
|
214
|
+
1. **`skillOptDriver` removed; behavior in `gepaDriver({ constraints })`** — 0.52.0 ✓ shipped
|
|
215
|
+
2. **Honest spec docs** — 0.52.0 ✓ shipped
|
|
216
|
+
3. **Profile-versioning spec with symmetric-fork framing** — 0.52.0 ✓ shipped
|
|
217
|
+
4. **No V2 names anywhere** — enforced
|
|
218
|
+
5. **Forcing-function gate on profile-versioning work** — Hermes-on-sandbox experiment required before phases 1-5 commit
|
|
219
|
+
6. **Single-PR-per-repo discipline** — enforced 0.52.0 onwards
|
|
220
|
+
7. **Prior-period comparison as 0.53.0** — committed; the customer-conversion primitive
|
|
221
|
+
8. **User-feedback extraction as 0.54.0** — committed; the Hermes-signal bridge
|
|
222
|
+
9. **Framework intake adapters as 0.55.0** — committed; 80% market coverage
|
|
223
|
+
10. **Empirical-proof publication as 1.1.0** — committed; the credibility lock
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
# Driver Honest Spec — what each driver IS, what each methodology actually is, where we deviate
|
|
2
|
+
|
|
3
|
+
**Status:** Living document. Updated when we learn the truth from primary sources.
|
|
4
|
+
**Date:** 2026-05-27
|
|
5
|
+
|
|
6
|
+
This document exists because the project shipped two drivers with methodology names attached (`gepaDriver`, `skillOptDriver`) without the methodology specs being precisely encoded anywhere in the repo. That created an integrity gap. This doc closes it.
|
|
7
|
+
|
|
8
|
+
Every claim in this doc is sourced from a primary reference (paper, code, or directly verifiable from our source). Marketing language is forbidden. If something is not implemented we say so.
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## Part 1 — GEPA (the paper)
|
|
13
|
+
|
|
14
|
+
**Source**: Agrawal et al., *"GEPA: Reflective Prompt Evolution Can Outperform Reinforcement Learning"*, arXiv:2507.19457, July 2025.
|
|
15
|
+
|
|
16
|
+
### What GEPA actually does
|
|
17
|
+
|
|
18
|
+
Outer loop (verbatim from abstract, emphasis added): "samples system-level trajectories (e.g., reasoning, tool calls, and tool outputs) and reflects on them in natural language to diagnose problems, propose and test prompt updates, and combine complementary lessons from the **Pareto frontier of its own attempts**."
|
|
19
|
+
|
|
20
|
+
Named primitives in the paper:
|
|
21
|
+
- **GEPA** (Genetic-Pareto) — the overall optimizer
|
|
22
|
+
- **Pareto frontier** — non-dominated candidate set retained across iterations
|
|
23
|
+
- **Prompt updates** — mutations proposed by reflection
|
|
24
|
+
- **Rollouts** — trajectory samples
|
|
25
|
+
|
|
26
|
+
### What gepaDriver in our substrate ACTUALLY does
|
|
27
|
+
|
|
28
|
+
Source: `src/campaign/drivers/gepa.ts` (132 lines)
|
|
29
|
+
|
|
30
|
+
- Single LLM call per `propose()` invocation
|
|
31
|
+
- Input: prior generation's **single best candidate by composite score** + that candidate's top/bottom scenarios + 3 weakest dimensions (`buildEvidence`)
|
|
32
|
+
- Output: N proposals, each a full document rewrite
|
|
33
|
+
- Dedup by exact text equality
|
|
34
|
+
|
|
35
|
+
### Deviations from the GEPA paper
|
|
36
|
+
|
|
37
|
+
| GEPA paper | Our `gepaDriver` |
|
|
38
|
+
|---|---|
|
|
39
|
+
| **Pareto frontier** of candidates | **Single "best by composite"** — no Pareto set, no non-dominated tracking |
|
|
40
|
+
| **Combine complementary lessons** from frontier | Each generation reflects on ONE prior candidate; no combination |
|
|
41
|
+
| Multi-objective optimization | Single-objective (composite score) |
|
|
42
|
+
| Genetic operators (mutation, crossover) | Reflection only — no crossover |
|
|
43
|
+
| Sample efficiency claim (up to 35× fewer rollouts than GRPO) | Unmeasured against any baseline |
|
|
44
|
+
|
|
45
|
+
**Honest assessment**: our `gepaDriver` is a **reflective full-rewrite driver**, not GEPA. It captures GEPA's *reflection* primitive but not its *Pareto* mechanism. The name oversells. A faithful renaming would be `reflectiveRewriteDriver`. A faithful implementation would add a Pareto candidate pool + combine step.
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Part 2 — SkillOpt (the paper + code)
|
|
50
|
+
|
|
51
|
+
**Source**:
|
|
52
|
+
- README: https://github.com/microsoft/SkillOpt
|
|
53
|
+
- Source: `/tmp/SkillOpt/skillopt/` (cloned 2026-05-27)
|
|
54
|
+
- Key files: `engine/trainer.py`, `optimizer/clip.py` (rank_and_select), `optimizer/update_modes.py`, `evaluation/gate.py`, `types.py`
|
|
55
|
+
|
|
56
|
+
### What SkillOpt actually does
|
|
57
|
+
|
|
58
|
+
**6-stage per-step pipeline** (verbatim from `trainer.py:516` and adjacent):
|
|
59
|
+
|
|
60
|
+
1. **Rollout** — `adapter.rollout(train_env, current_skill, ...)` collects trajectories on a batch.
|
|
61
|
+
2. **Reflect** — `adapter.reflect()` analyses trajectories and emits **structured patches** (NOT full rewrites in patch mode). Failure trials → failure patches; success trials → success patches.
|
|
62
|
+
3. **Aggregate** — `merge_patches(current_skill, all_failure_patches, all_success_patches, batch_size=merge_bs)` — hierarchically merges patches across accumulated batches.
|
|
63
|
+
4. **Select** — `rank_and_select(current_skill, merged_patch, max_edits=edit_budget)` — if edit pool > budget, calls an optimizer LLM to **rank edits by importance** and keep top-L. Budget is "analogous to gradient clipping" (their words).
|
|
64
|
+
5. **Update** — apply patch in one of 3 modes:
|
|
65
|
+
- **`patch`** — deterministic diff apply via `apply_patch_with_report()`; ops are `append | insert_after | replace | delete`
|
|
66
|
+
- **`rewrite_from_suggestions`** — LLM regenerates full skill from suggestions
|
|
67
|
+
- **`full_rewrite_minibatch`** — reflection directly emits complete candidate skills; select picks the best
|
|
68
|
+
6. **Evaluate & Gate** — runs candidate on selection set, calls `evaluate_gate(cand_hard, current_score, best_score)`. Returns `accept_new_best | accept | reject` from a **literal `cand_hard > current_score`** comparison (`evaluation/gate.py:38`). No statistical test.
|
|
69
|
+
|
|
70
|
+
Plus epoch-level stages:
|
|
71
|
+
- **Slow update** — `run_slow_update()` builds longitudinal pairs across epochs.
|
|
72
|
+
- **Meta skill** — `run_meta_skill()` produces optimizer-side memory of patterns across adjacent epochs.
|
|
73
|
+
|
|
74
|
+
### Canonical patch shape (from `types.py:22-45`)
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
EditOp = Literal["append", "insert_after", "replace", "delete"]
|
|
78
|
+
|
|
79
|
+
@dataclass
|
|
80
|
+
class Edit:
|
|
81
|
+
op: EditOp
|
|
82
|
+
content: str
|
|
83
|
+
target: str # for replace/delete/insert_after
|
|
84
|
+
support_count: int | None # how many trials voted for this edit
|
|
85
|
+
source_type: Literal["failure", "success"] | None
|
|
86
|
+
merge_level: int | None
|
|
87
|
+
|
|
88
|
+
@dataclass
|
|
89
|
+
class Patch:
|
|
90
|
+
edits: list[Edit]
|
|
91
|
+
reasoning: str
|
|
92
|
+
ranking_details: dict | None
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### What `skillOptDriver` v0.51.0 in our substrate ACTUALLY does
|
|
96
|
+
|
|
97
|
+
Source: `src/campaign/drivers/skillopt.ts` (current as of 0.51.0)
|
|
98
|
+
|
|
99
|
+
- Single LLM call per `propose()` returning N full document rewrites
|
|
100
|
+
- Post-parse rejection on: (a) any H2 header dropped, (b) sentence-edit count > editBudget × 2
|
|
101
|
+
- Substantively equivalent to `gepaDriver` + 2 validation constraints
|
|
102
|
+
|
|
103
|
+
### Deviations from SkillOpt
|
|
104
|
+
|
|
105
|
+
| SkillOpt actual | Our 0.51.0 `skillOptDriver` |
|
|
106
|
+
|---|---|
|
|
107
|
+
| 6-stage pipeline (rollout → reflect → aggregate → select → update → gate) | Single LLM call → N rewrites |
|
|
108
|
+
| **Patch-based edits** (`{op, target, content, support_count, source_type}`) | Full document rewrites only |
|
|
109
|
+
| `merge_patches()` hierarchical merge across batches | No aggregation; each `propose()` is independent |
|
|
110
|
+
| `rank_and_select(max_edits=edit_budget)` LLM-ranking of edits | All candidates that pass validation are returned |
|
|
111
|
+
| 3 update modes (`patch`, `rewrite_from_suggestions`, `full_rewrite_minibatch`) | Only `full_rewrite_minibatch`-equivalent |
|
|
112
|
+
| `evaluate_gate()` with `accept_new_best/accept/reject` codes | Substrate's outer gate decides ship/hold/inspect; driver doesn't see fine-grained accept signal |
|
|
113
|
+
| Longitudinal `slow_update` across epochs | Not implemented |
|
|
114
|
+
| `meta_skill` optimizer-side memory | Not implemented |
|
|
115
|
+
| Selection-set cache (`sel_cache`) for repeated candidate hashes | Not implemented |
|
|
116
|
+
| Edit-budget LR scheduler (constant / linear / cosine / autonomous) | Single fixed `editBudget` |
|
|
117
|
+
| Mini-batch accumulation (`steps_per_epoch`, `merge_batch_size`) | Not implemented |
|
|
118
|
+
| `decide_autonomous_learning_rate()` | Not implemented |
|
|
119
|
+
| `longitudinal_pair_policy` (mixed / changed / unchanged) | Not implemented |
|
|
120
|
+
|
|
121
|
+
**Honest assessment**: 13 substantive deviations. `skillOptDriver` 0.51.0 is **not** SkillOpt. It is `gepaDriver` with two post-validation constraints (section preservation, sentence-edit count). The methodology name oversells the implementation.
|
|
122
|
+
|
|
123
|
+
### One thing where we are STRICTER than SkillOpt
|
|
124
|
+
|
|
125
|
+
**The gate.** SkillOpt: literal `cand_hard > current_score` (`evaluation/gate.py:38`). Our substrate: paired bootstrap + 95% CI + Cohen's d + MDE (minimum detectable effect) + p-value (`defaultProductionGate`). When the lift CI straddles zero, our gate returns `hold` / `inspect`. SkillOpt would accept any improvement at all, even single-sample noise.
|
|
126
|
+
|
|
127
|
+
This is real differentiation we have not been crediting ourselves for.
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## Part 3 — Hermes Agent's "self-improvement"
|
|
132
|
+
|
|
133
|
+
**Source**: `/tmp/hermes-agent/` (cloned 2026-05-27)
|
|
134
|
+
- `agent/curator.py` (the actual loop)
|
|
135
|
+
- `agent/skill_commands.py`
|
|
136
|
+
- `agent/skill_utils.py`
|
|
137
|
+
|
|
138
|
+
### What Hermes actually does
|
|
139
|
+
|
|
140
|
+
From `curator.py` line 1: "Curator — background skill maintenance orchestrator. The curator is an auxiliary-model task that periodically reviews agent-created skills and maintains the collection."
|
|
141
|
+
|
|
142
|
+
Trigger: idle-driven, with default `DEFAULT_INTERVAL_HOURS = 24 * 7` (7 days). When the agent has been idle for at least `DEFAULT_MIN_IDLE_HOURS = 2` hours and the last curator run was more than 7 days ago, `maybe_run_curator()` spawns a forked AIAgent.
|
|
143
|
+
|
|
144
|
+
What the curator does:
|
|
145
|
+
- "Auto-transition lifecycle states based on derived skill activity timestamps"
|
|
146
|
+
- "Spawn a background review agent that can **pin / archive / consolidate / patch** agent-created skills via `skill_manage`"
|
|
147
|
+
- "Persist curator state (last_run_at, paused, etc.) in `.curator_state`"
|
|
148
|
+
|
|
149
|
+
Strict invariants:
|
|
150
|
+
- Only touches agent-created skills
|
|
151
|
+
- "Never auto-deletes — only archives"
|
|
152
|
+
- Pinned skills bypass auto-transitions
|
|
153
|
+
- Uses the auxiliary client (separate from main session)
|
|
154
|
+
|
|
155
|
+
### Hermes' actual gate
|
|
156
|
+
|
|
157
|
+
**There is none.** The curator is an LLM editor making editorial decisions. There is no:
|
|
158
|
+
- Held-out validation
|
|
159
|
+
- Performance comparison between old and new skill versions
|
|
160
|
+
- Statistical test
|
|
161
|
+
- Rejection-on-regression mechanism
|
|
162
|
+
|
|
163
|
+
Skills are refined by an LLM looking at usage patterns; the refinement is accepted because the LLM proposed it.
|
|
164
|
+
|
|
165
|
+
### Honest assessment
|
|
166
|
+
|
|
167
|
+
Hermes has a **skill curation system**, not a self-improvement loop. The README's claim "the only agent with a built-in learning loop" is generous — it's a 7-day-cron LLM librarian. There's no measurable guarantee that today's curated skill collection performs better than yesterday's.
|
|
168
|
+
|
|
169
|
+
Compare:
|
|
170
|
+
| Component | Hermes | SkillOpt | Tangle |
|
|
171
|
+
|---|---|---|---|
|
|
172
|
+
| Validation gate | None | `>` | Paired bootstrap CI |
|
|
173
|
+
| Patch-level edits | No (LLM rewrites whole skill) | Yes | No (full rewrite only) |
|
|
174
|
+
| Skill ranking / selection | No | Yes | No |
|
|
175
|
+
| Sample efficiency claim | None | 35× vs GRPO | None |
|
|
176
|
+
| Frequency | 7-day cron | Per training step | Per `selfImprove()` call |
|
|
177
|
+
|
|
178
|
+
Where Tangle WINS: the gate. Where SkillOpt WINS: the pipeline sophistication. Where Hermes WINS: the deployment story (multi-platform, multi-tool-backend).
|
|
179
|
+
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
## Part 4 — What we should actually do
|
|
183
|
+
|
|
184
|
+
### Phase A — rename to honest names (0.51.1, this session)
|
|
185
|
+
|
|
186
|
+
The current `skillOptDriver` and `gepaDriver` names overclaim. Options:
|
|
187
|
+
|
|
188
|
+
1. **Rename both:**
|
|
189
|
+
- `gepaDriver` → `reflectiveRewriteDriver` (drops the "Pareto" implication)
|
|
190
|
+
- `skillOptDriver` → `constrainedReflectiveDriver` (drops the SkillOpt-methodology implication)
|
|
191
|
+
- Reserve `gepaDriver` + `skillOptDriver` for faithful implementations
|
|
192
|
+
2. **Keep `gepaDriver` name** (it's our most-used driver; renaming is disruptive); rename `skillOptDriver`.
|
|
193
|
+
3. **Keep both names; add `@experimental` + a "differs from paper" docstring section.** Cheapest. Truthful enough.
|
|
194
|
+
|
|
195
|
+
Recommendation: **option 3**, with the "deviations from paper" section placed in the header docstring of each driver source file. Run the empirical harness (Phase B) before deciding on any rename.
|
|
196
|
+
|
|
197
|
+
### Phase B — build the honest empirical harness (0.51.1, this session)
|
|
198
|
+
|
|
199
|
+
`tests/driver-empirical.bench.ts` — for each driver:
|
|
200
|
+
- Same scenarios (5 synthetic + 5 real legal-agent scenarios)
|
|
201
|
+
- Same judge
|
|
202
|
+
- Same `baselineSurface`
|
|
203
|
+
- Same `budget` (1 gen, 3 candidates, holdout 0.3)
|
|
204
|
+
- Report: lift mean, lift CI95, p-value, rollouts spent, cost (USD) spent
|
|
205
|
+
|
|
206
|
+
Drivers in the matrix:
|
|
207
|
+
- `gepaDriver` (current full-rewrite reflection)
|
|
208
|
+
- `skillOptDriver` (current 0.51.0 full-rewrite + constraints)
|
|
209
|
+
- Future: real `skillOptDriverV2` with patch mode
|
|
210
|
+
|
|
211
|
+
This is the **falsifiable test** of whether our drivers live up to the methodology names they carry.
|
|
212
|
+
|
|
213
|
+
### Phase C — implement SkillOpt patch mode for real (0.52.0)
|
|
214
|
+
|
|
215
|
+
Build `skillOptDriverV2` with:
|
|
216
|
+
1. **`Edit` type matching SkillOpt's**: `{op: 'append'|'insert_after'|'replace'|'delete', content, target?, support_count?, source_type?}`
|
|
217
|
+
2. **Reflect step emits patches**, not full rewrites
|
|
218
|
+
3. **`mergePatches()`** — LLM-driven hierarchical merge of failure + success patches
|
|
219
|
+
4. **`rankAndSelect()`** — LLM-driven ranking when edit pool > budget
|
|
220
|
+
5. **Deterministic `applyPatch()`** — string ops, no LLM
|
|
221
|
+
6. **Keep our gate** (paired bootstrap CI). Don't downgrade to SkillOpt's `>` — that's our edge.
|
|
222
|
+
|
|
223
|
+
Estimated scope: 400-600 lines + tests.
|
|
224
|
+
|
|
225
|
+
### Phase D — implement GEPA's Pareto frontier (0.53.0)
|
|
226
|
+
|
|
227
|
+
Build `gepaDriverV2` with:
|
|
228
|
+
1. **Candidate pool** retained across generations (non-dominated)
|
|
229
|
+
2. **Multi-objective evaluation** (composite + cost + length + diversity)
|
|
230
|
+
3. **Combine step** — LLM combines lessons from non-dominated candidates
|
|
231
|
+
4. Keep reflection.
|
|
232
|
+
5. Sample-efficiency target: match the GEPA paper's claim of up to ~35× fewer rollouts than GRPO, on a benchmark we choose.
|
|
233
|
+
|
|
234
|
+
Estimated scope: 500-800 lines + tests.
|
|
235
|
+
|
|
236
|
+
---
|
|
237
|
+
|
|
238
|
+
## Source pointers (audit trail)
|
|
239
|
+
|
|
240
|
+
- GEPA paper: https://arxiv.org/abs/2507.19457
|
|
241
|
+
- SkillOpt repo: https://github.com/microsoft/SkillOpt (cloned at `/tmp/SkillOpt/` 2026-05-27)
|
|
242
|
+
- Hermes repo: https://github.com/NousResearch/hermes-agent (cloned at `/tmp/hermes-agent/` 2026-05-27)
|
|
243
|
+
- Our gepaDriver: `src/campaign/drivers/gepa.ts`
|
|
244
|
+
- Our skillOptDriver: `src/campaign/drivers/skillopt.ts`
|
|
245
|
+
- Our gate: `src/campaign/gates/default-production-gate.ts`
|
|
246
|
+
- Our reflection primitive: `src/reflective-mutation.ts`
|
|
247
|
+
|
|
248
|
+
Update this doc when:
|
|
249
|
+
- We discover new behavior in any of the upstream methods (via reading their code, not their READMEs)
|
|
250
|
+
- We ship a driver that closes one of the named gaps
|
|
251
|
+
- We run the empirical harness and have real numbers to add
|