@tangle-network/agent-eval 0.51.0 → 0.53.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -79,25 +79,48 @@ declare function evolutionaryDriver<TFindings = unknown>(opts: EvolutionaryDrive
79
79
  *
80
80
  * `gepaDriver` — a reflective `ImprovementDriver` for prompt-tier surfaces.
81
81
  * Each generation it reflects on the prior best candidate's per-scenario
82
- * scores + weakest dimensions (the `GenerationCandidate` evidence from
83
- * `runOptimization`), asks an LLM to propose targeted rewrites of the current
84
- * surface, and returns them as the next population.
82
+ * scores + weakest dimensions, asks an LLM to propose targeted rewrites of
83
+ * the current surface, and returns them as the next population.
85
84
  *
86
- * This is the substrate's best-in-class prompt optimizer: surface-agnostic, so
87
- * ANY string surface in ANY consumer opts in by selecting it system prompts,
88
- * prompt addenda, judge/reviewer prompts, even a driver's own reflection
89
- * prompt. It reuses the generic reflection primitive (`buildReflectionPrompt` /
90
- * `parseReflectionResponse`) and the router client; it has NO dependency on the
91
- * legacy `runMultiShotOptimization` / `prompt-evolution` orchestration.
85
+ * Honest scope vs the GEPA paper (Agrawal et al., arXiv:2507.19457):
86
+ * this driver implements the *reflection* primitive; it does NOT implement
87
+ * GEPA's Pareto frontier of candidates, multi-objective non-dominated
88
+ * tracking, or the combine-complementary-lessons step. We use "best by
89
+ * composite" as the parent each generation; the paper retains a Pareto set
90
+ * and combines lessons across non-dominated candidates. Tracked as #101 in
91
+ * the substrate roadmap. See `docs/specs/driver-honest-spec.md`.
92
92
  *
93
- * It earns its keep where there is real per-instance signal (which the
93
+ * Optional `constraints` move structured-doc guards into the driver
94
+ * (preserve H2 section headings, cap sentence-level edits) — useful when
95
+ * the surface IS a structured procedure like a SKILL.md / runbook /
96
+ * judge rubric. When `constraints` is omitted, behavior is unchanged.
97
+ *
98
+ * The driver is surface-agnostic — any string surface in any consumer opts
99
+ * in by selecting it. Reuses the generic reflection primitive
100
+ * (`buildReflectionPrompt` / `parseReflectionResponse`) and the router
101
+ * client; no dependency on the legacy `runMultiShotOptimization` /
102
+ * `prompt-evolution` orchestration.
103
+ *
104
+ * Earns its keep where there is real per-instance signal (which the
94
105
  * dimensional + per-scenario evidence + the `LabeledScenarioStore` flywheel
95
- * now provide). For thin-signal surfaces it degrades to plain reflection — so
96
- * it is a SELECTABLE driver, never a forced default. On generation 0 (no
97
- * history) it reflects on the current surface against the mutation primitives
98
- * alone.
106
+ * now provide). For thin-signal surfaces it degrades to plain reflection.
107
+ * On generation 0 (no history) it reflects on the current surface against
108
+ * the mutation primitives alone.
99
109
  */
100
110
 
111
+ interface GepaDriverConstraints {
112
+ /** H2 section headings that MUST appear unchanged in every candidate.
113
+ * When set, the driver auto-detects current H2s if this is empty AND
114
+ * rejects any candidate that drops or renames a preserved heading.
115
+ * Use when the surface is a structured doc (SKILL.md, runbook,
116
+ * sectioned system prompt, judge rubric). */
117
+ preserveSections?: string[];
118
+ /** Maximum sentence-level edits per candidate vs the parent surface.
119
+ * Rejection threshold = maxSentenceEdits × 2 (counts adds + removes).
120
+ * Inspired by SkillOpt's edit-budget as a "textual learning rate."
121
+ * Cap prevents an LLM rewrite from overwriting useful prior rules. */
122
+ maxSentenceEdits?: number;
123
+ }
101
124
  interface GepaDriverOptions {
102
125
  /** Router transport (apiKey/baseUrl). */
103
126
  llm: LlmClientOptions;
@@ -113,8 +136,18 @@ interface GepaDriverOptions {
113
136
  temperature?: number;
114
137
  /** Reflection max tokens. Default 6000. */
115
138
  maxTokens?: number;
139
+ /** Structured-doc constraints. Candidates violating any are rejected
140
+ * post-parse and dropped from the returned population. */
141
+ constraints?: GepaDriverConstraints;
116
142
  }
117
143
  declare function gepaDriver(opts: GepaDriverOptions): ImprovementDriver;
144
+ /** Extract H2 headings (`## Foo`) from a markdown surface. Exported for
145
+ * consumers building custom mutators that share the same invariant. */
146
+ declare function extractH2Sections(text: string): string[];
147
+ /** Sentence-level edit distance — count distinct add/remove ops between
148
+ * two surfaces via a normalised line-by-line set diff. Treats trivial
149
+ * whitespace as identical. Exported for tests + consumer-side validators. */
150
+ declare function countSentenceEdits(baseline: string, candidate: string): number;
118
151
 
119
152
  /**
120
153
  * @experimental
@@ -414,4 +447,4 @@ interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extend
414
447
  }
415
448
  declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>;
416
449
 
417
- export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverOptions as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type OpenAutoPrResult as a, type RunEvalOptions as b, type RunImprovementLoopOptions as c, type RunImprovementLoopResult as d, type RunOptimizationOptions as e, type RunOptimizationResult as f, composeGate as g, defaultProductionGate as h, evolutionaryDriver as i, fsCampaignStorage as j, gepaDriver as k, heldOutGate as l, inMemoryCampaignStorage as m, runEval as n, openAutoPr as o, runImprovementLoop as p, runOptimization as q, runCampaign as r, surfaceHash as s };
450
+ export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverConstraints as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type GepaDriverOptions as a, type OpenAutoPrResult as b, type RunEvalOptions as c, type RunImprovementLoopOptions as d, type RunImprovementLoopResult as e, type RunOptimizationOptions as f, type RunOptimizationResult as g, composeGate as h, countSentenceEdits as i, defaultProductionGate as j, evolutionaryDriver as k, extractH2Sections as l, fsCampaignStorage as m, gepaDriver as n, heldOutGate as o, inMemoryCampaignStorage as p, openAutoPr as q, runCampaign as r, runEval as s, runImprovementLoop as t, runOptimization as u, surfaceHash as v };
@@ -0,0 +1,223 @@
1
+ # Self-improvement protocol — the world-class architecture
2
+
3
+ **Status:** Strategic design. The artifact that every roadmap entry maps to.
4
+ **Date:** 2026-05-27.
5
+
6
+ ## Thesis
7
+
8
+ **Self-improvement is a protocol, not a product.** We define the wire formats, surface abstractions, driver interface, gate interface, and insight format. We ship reference implementations. Customers plug in whatever framework, model, or runtime they already use — our infrastructure handles the rigorous middle (analysis, gating, version-safe deployment).
9
+
10
+ No competitor ships this combination. LangSmith / Braintrust / Phoenix / LangFuse ship tracing. Hermes ships an agent. SkillOpt ships an academic optimizer. Anthropic's Claude Code ships skill-creation. **Nobody ships the protocol.**
11
+
12
+ ## The pipeline as a single abstract flow
13
+
14
+ ```
15
+ ┌──────────────────────────────────────────────────────────────────────┐
16
+ │ WHATEVER YOU ALREADY USE │
17
+ │ LangChain · LlamaIndex · Anthropic SDK · OpenAI Assistants · │
18
+ │ Hermes · Claude Code · Codex · agent-runtime · your own stack │
19
+ └─────────────────────────────────┬────────────────────────────────────┘
20
+ │ traces (any format)
21
+
22
+ ┌──────────────────────────────────────────────────────────────────────┐
23
+ │ INGEST — universal trace adapters │
24
+ │ fromOtelSpans · fromFeedbackTable · fromLangChain · fromLlamaIndex ·│
25
+ │ fromAnthropicSDK · fromOpenAISDK · fromHermesProfileLog · BYO │
26
+ │ → canonical RunRecord[] │
27
+ └─────────────────────────────────┬────────────────────────────────────┘
28
+
29
+ ┌──────────────────────────────────────────────────────────────────────┐
30
+ │ ANALYZE — analyzeRuns({ runs, baselineRuns?, userFeedback? }) │
31
+ │ paired-bootstrap CI · Pareto · failure clusters · prior-period │
32
+ │ delta · user-corrective-signal extraction · recommendations │
33
+ │ ← THE STATISTICAL EDGE NOBODY ELSE SHIPS │
34
+ └─────────────────────────────────┬────────────────────────────────────┘
35
+
36
+ ┌──────────────────────────────────────────────────────────────────────┐
37
+ │ IMPROVE — selfImprove() closed loop │
38
+ │ gepaDriver · evolutionaryDriver · BYO ImprovementDriver │
39
+ │ → ProfileDiff (versioned, hashed, content-addressable) │
40
+ └─────────────────────────────────┬────────────────────────────────────┘
41
+
42
+ ┌──────────────────────────────────────────────────────────────────────┐
43
+ │ GATE — defaultProductionGate (paired-CI) · BYO gate │
44
+ │ ship-substrate / ship-harness / merge / inconclusive │
45
+ │ ← STATISTICALLY STRICTER THAN ANY COMPETITOR │
46
+ └─────────────────────────────────┬────────────────────────────────────┘
47
+
48
+ ┌──────────────────────────────────────────────────────────────────────┐
49
+ │ DEPLOY — back into WHATEVER YOU ALREADY USE │
50
+ │ agent-runtime · Hermes profile log · LangChain config · custom hook │
51
+ └──────────────────────────────────────────────────────────────────────┘
52
+ ```
53
+
54
+ ## The integration promise
55
+
56
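+ Customers pick one of three integration shapes. Shapes A and B work today, though intake-adapter coverage is still partial (see the gap table below); Shape C is still in scaffolding, so customers must self-host for now. Every shape uses the same canonical types underneath.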
57
+
58
+ ### Shape A — offline analysis only
59
+
60
+ You have traces, you want a decision packet. Zero LLM cost. Zero closed loop.
61
+
62
+ ```typescript
63
+ import { fromOtelSpans, analyzeRuns } from '@tangle-network/agent-eval'
64
+
65
+ const runs = fromOtelSpans({ spans: mySpans })
66
+ const report = await analyzeRuns({ runs })
67
+ // → InsightReport with composite, recommendations, Pareto, ...
68
+ ```
69
+
70
+ Use case: dashboards, weekly post-mortems, "did anything regress" checks. The intelligence-kernel ships this.
71
+
72
+ ### Shape B — closed loop, your runtime
73
+
74
+ You have an agent, you want to improve it. We provide drivers + gate + insight. You decide when to deploy.
75
+
76
+ ```typescript
77
+ import { selfImprove, gepaDriver } from '@tangle-network/agent-eval'
78
+
79
+ const result = await selfImprove({
80
+ scenarios,
81
+ agent: yourAgent, // any function (surface, scenario) → artifact
82
+ judge: yourJudge, // any function (artifact) → JudgeScore
83
+ baselineSurface,
84
+ driver: gepaDriver({ llm, model, target }),
85
+ budget: { generations: 3, populationSize: 4, holdoutFraction: 0.3 },
86
+ })
87
+ // → SelfImproveResult { baselineHash, diff, winningHash, lift, gateDecision, insight }
88
+ ```
89
+
90
+ Use case: every product agent we ship. Hermes-on-our-sandbox. Claude Code with skills. Anyone wanting "ship if statistically better, else hold."
91
+
92
+ ### Shape C — hosted, cross-language
93
+
94
+ You stream traces from anywhere, get InsightReports + selfImprove orchestration. Bills usage-based.
95
+
96
+ ```sh
97
+ # Stream traces
98
+ curl https://api.tangle.tools/v1/ingest/otel \
99
+ -H "Authorization: Bearer ${TANGLE_KEY}" \
100
+ --data-binary @traces.jsonl
101
+
102
+ # Get the decision packet
103
+ curl https://api.tangle.tools/v1/insight/${runId}
104
+
105
+ # Or run a closed-loop campaign
106
+ curl https://api.tangle.tools/v1/improve \
107
+ -d '{"scenarios": ..., "baselineHash": "...", "budget": {...}}'
108
+ ```
109
+
110
+ Use case: Python customers, Go customers, customers behind firewalls, customers who don't want to operate the substrate.
111
+
112
+ ## The five non-negotiables
113
+
114
+ The protocol claim only holds if all five of these survive integration. Customers shouldn't have to compromise on any.
115
+
116
+ 1. **Universal ingest.** Any trace format → canonical RunRecord. Coverage: OTel ✓, multi-rater feedback ✓, LangChain ⏳, LlamaIndex ⏳, Anthropic SDK ⏳, OpenAI Assistants ⏳, Hermes profile log ⏳.
117
+ 2. **Statistical rigor.** Every claim falsifiable. Paired bootstrap CI on lift, Cohen's d on effect size, MDE-aware sample-size recommendations, p-values. **SkillOpt's gate is literal `cand > current`. Hermes has no gate. Ours has all of the above.** This is the moat.
118
+ 3. **Plug-in everything.** Driver, judge, gate, intake adapter, storage all swappable. Customer brings their LLM, their judge, their scenarios. We bring the rigor.
119
+ 4. **Version-safe deployment.** AgentProfile is content-addressable. Two writers (harness + substrate) can both mutate without lost-update. Gate verdicts are scoped to baseline hash, not absolute. Tracked as #98.
120
+ 5. **Cross-language wire format.** Python client at parity with TypeScript. Hosted ingest spec versioned. Customers in any language consume the same shape.
121
+
122
+ ## Where we are honest about gaps
123
+
124
+ | Component | Status | Customer impact when missing |
125
+ |---|---|---|
126
+ | `fromOtelSpans` ingest adapter | ✓ shipped 0.50.0 | — |
127
+ | `fromFeedbackTable` multi-rater intake | ✓ shipped 0.50.0 | — |
128
+ | `analyzeRuns` decision packet | ✓ shipped 0.50.0 / 0.50.2 actionability | — |
129
+ | `selfImprove` closed loop | ✓ shipped 0.50.0 | — |
130
+ | Paired-bootstrap gate | ✓ shipped early; still our edge | — |
131
+ | `gepaDriver` reflection (not full Pareto — task #101) | ⚠ partial | OK; customers don't need Pareto until plateau hit |
132
+ | **Prior-period comparison** in `analyzeRuns` | ✗ MISSING | "Did my last change help?" — the #1 customer question — has no rigorous answer today |
133
+ | **User-corrective-feedback signal extraction** | ✗ MISSING | Hermes' first-class skill signal. We have the trace data. We don't mine it. |
134
+ | **`init` CLI** scaffolding canonical eval/ layout | ✗ MISSING | Every new consumer wires it by hand; the skill describes 80 lines they have to copy |
135
+ | **Framework-specific intake adapters** (LangChain, LlamaIndex, Anthropic SDK, OpenAI Assistants) | ✗ MISSING | Customers using these frameworks can't ingest without writing custom adapter code |
136
+ | **Profile versioning** (task #98) | ✗ MISSING | Offline/online drift; gate verdicts can be stale by the time they're applied |
137
+ | **Composite driver** (optimize all surfaces against one gate) | ✗ MISSING | Customers can optimize prompts OR skills, not both jointly |
138
+ | **Empirical proof drivers work** | ✗ MISSING | We've never published "we ran gepaDriver on real customer data, here's the lift CI" |
139
+ | Hosted-tier production launch | ⚠ in scaffolding (intelligence-kernel) | Customers must self-host today |
140
+
141
+ ## The roadmap — what closes each gap
142
+
143
+ Mapping every roadmap entry back to a concrete protocol gap.
144
+
145
+ ### 0.53.0 (this session-or-next) — answer "did my last change help?"
146
+
147
+ - **`analyzeRuns({ runs, baselineRuns? })`** — when `baselineRuns` is provided, the report includes a `priorPeriodComparison?` block: per-metric delta with paired-bootstrap CI, MDE-aware significance judgment, "regressed metrics" surfaced in `recommendations`.
148
+ - Built on top of existing `diffRuns()` primitive (already shipped 0.48.0).
149
+ - 1 PR. Pure additive surface.
150
+ - **Customer impact**: this is the conversion question for every prospect.
151
+
152
+ ### 0.54.0 — extract Hermes' missing signal
153
+
154
+ - **`extractUserCorrections(runs)`** — new substrate primitive. Mines user messages in traces for corrective markers (regex pass + LLM classifier for nuance). Returns `UserCorrectionEvent[]` keyed by runId.
155
+ - `analyzeRuns({ runs, userFeedback? })` includes a "common corrections" cluster in `recommendations`.
156
+ - Bridge to Hermes-style signal without adopting Hermes' runtime.
157
+ - **Customer impact**: distinctive — no competitor mines this signal.
158
+
159
+ ### 0.55.0 — framework-specific intake adapters
160
+
161
+ - **`fromLangChain(traces)`**, **`fromLlamaIndex(traces)`**, **`fromAnthropicSDK(traces)`**, **`fromOpenAIAssistants(traces)`**.
162
+ - Each maps the framework's native trace shape to RunRecord.
163
+ - Top 4 frameworks = 80% of agent-builder market coverage.
164
+ - **Customer impact**: removes "we don't support your framework" friction.
165
+
166
+ ### 0.56.0 — `init` CLI + worked examples
167
+
168
+ - `pnpm dlx @tangle-network/agent-eval init` scaffolds the canonical `eval/scenarios.json` + 3 pnpm scripts + judges template + `.runs/` directory.
169
+ - Adds 5+ end-to-end runnable examples covering Shapes A/B/C across the 4 framework adapters.
170
+ - **Customer impact**: time-to-first-eval drops from 4 hours to 5 minutes.
171
+
172
+ ### 1.0.0 — profile versioning (#98) + composite driver
173
+
174
+ - Content-addressable `AgentProfileVersion` + `ProfileDiff` + 3-way merge + 4-way `DriftGateDecision`.
175
+ - `compositeDriver` — optimize all surfaces of one AgentProfile against one gate.
176
+ - Hermes-on-sandbox forcing function validates the work before commit.
177
+ - **Customer impact**: production-safe; the moat is locked.
178
+
179
+ ### 1.1.0 — empirical-proof publication
180
+
181
+ - Pick one named customer or one synthetic-realistic corpus (legal-agent canonical).
182
+ - Run gepaDriver end-to-end with real LLM cost.
183
+ - Publish: "n=, lift=, CI=, p=, $cost=, vs no-driver baseline."
184
+ - One blog post, one demo video, one runnable repro.
185
+ - **Customer impact**: every other claim becomes credible because this one is verified.
186
+
187
+ ## Why this design is 100x
188
+
189
+ Not a 10% improvement over LangSmith. A category change.
190
+
191
+ | Capability | LangSmith / Braintrust / Phoenix | Hermes / Claude Code | Tangle (target) |
192
+ |---|---|---|---|
193
+ | Trace ingest | ✓ proprietary | ✓ own runtime | ✓ universal |
194
+ | Decision packet | ⚠ scorecards (no CI) | ✗ | ✓ paired-bootstrap |
195
+ | Closed loop | ✗ | ✓ heuristic | ✓ statistically rigorous |
196
+ | Plug-in drivers | ✗ | ✗ | ✓ |
197
+ | Profile versioning | ✗ | ✗ | ✓ (1.0.0) |
198
+ | Composite multi-surface | ✗ | ✗ | ✓ (1.0.0) |
199
+ | Cross-language | ✗ | ✗ | ✓ (Python at parity) |
200
+ | Empirical-proof publication | ✗ | ✗ | ✓ (1.1.0) |
201
+
202
+ Eight rows. Nobody else has eight. We can be the only one. The work is named, scoped, and queued.
203
+
204
+ ## What's NOT on the roadmap (and why)
205
+
206
+ - **Building our own agent runtime.** Hermes / agent-runtime / Claude Code cover that. We are infrastructure, not a runtime.
207
+ - **Single-vendor LLM.** Substrate stays model-agnostic.
208
+ - **UI-first product.** API-first. UIs are downstream.
209
+ - **LangChain replacement.** Wrong layer.
210
+ - **"Self-improvement" without a held-out gate.** Hermes and SkillOpt both ship this; we explicitly refuse — every selfImprove() requires a holdout.
211
+
212
+ ## Decision log — what we committed to in 0.52.0 → 1.1.0
213
+
214
+ 1. **`skillOptDriver` removed; behavior in `gepaDriver({ constraints })`** — 0.52.0 ✓ shipped
215
+ 2. **Honest spec docs** — 0.52.0 ✓ shipped
216
+ 3. **Profile-versioning spec with symmetric-fork framing** — 0.52.0 ✓ shipped
217
+ 4. **No V2 names anywhere** — enforced
218
+ 5. **Forcing-function gate on profile-versioning work** — Hermes-on-sandbox experiment required before phases 1-5 commit
219
+ 6. **Single-PR-per-repo discipline** — enforced 0.52.0 onwards
220
+ 7. **Prior-period comparison as 0.53.0** — committed; the customer-conversion primitive
221
+ 8. **User-feedback extraction as 0.54.0** — committed; the Hermes-signal bridge
222
+ 9. **Framework intake adapters as 0.55.0** — committed; 80% market coverage
223
+ 10. **Empirical-proof publication as 1.1.0** — committed; the credibility lock
@@ -0,0 +1,251 @@
1
+ # Driver Honest Spec — what each driver IS, what each methodology actually is, where we deviate
2
+
3
+ **Status:** Living document. Updated when we learn the truth from primary sources.
4
+ **Date:** 2026-05-27
5
+
6
+ This document exists because the project shipped two drivers with methodology names attached (`gepaDriver`, `skillOptDriver`) without the methodology specs being precisely encoded anywhere in the repo. That created an integrity gap. This doc closes it.
7
+
8
+ Every claim in this doc is sourced from a primary reference (paper, code, or directly verifiable from our source). Marketing language is forbidden. If something is not implemented we say so.
9
+
10
+ ---
11
+
12
+ ## Part 1 — GEPA (the paper)
13
+
14
+ **Source**: Agrawal et al., *"GEPA: Reflective Prompt Evolution Can Outperform Reinforcement Learning"*, arXiv:2507.19457, July 2025.
15
+
16
+ ### What GEPA actually does
17
+
18
+ Outer loop (verbatim from abstract): "samples trajectories (e.g., reasoning, tool calls, and tool outputs) and reflects on them in natural language to diagnose problems, propose and test prompt updates, and combine complementary lessons from the **Pareto frontier of its own attempts**."
19
+
20
+ Named primitives in the paper:
21
+ - **GEPA** (Genetic-Pareto) — the overall optimizer
22
+ - **Pareto frontier** — non-dominated candidate set retained across iterations
23
+ - **Prompt updates** — mutations proposed by reflection
24
+ - **Rollouts** — trajectory samples
25
+
26
+ ### What gepaDriver in our substrate ACTUALLY does
27
+
28
+ Source: `src/campaign/drivers/gepa.ts` (132 lines)
29
+
30
+ - Single LLM call per `propose()` invocation
31
+ - Input: prior generation's **single best candidate by composite score** + that candidate's top/bottom scenarios + 3 weakest dimensions (`buildEvidence`)
32
+ - Output: N proposals, each a full document rewrite
33
+ - Dedup by exact text equality
34
+
35
+ ### Deviations from the GEPA paper
36
+
37
+ | GEPA paper | Our `gepaDriver` |
38
+ |---|---|
39
+ | **Pareto frontier** of candidates | **Single "best by composite"** — no Pareto set, no non-dominated tracking |
40
+ | **Combine complementary lessons** from frontier | Each generation reflects on ONE prior candidate; no combination |
41
+ | Multi-objective optimization | Single-objective (composite score) |
42
+ | Genetic operators (mutation, crossover) | Reflection only — no crossover |
43
+ | Sample efficiency claim (35× fewer rollouts than GRPO) | Unmeasured against any baseline |
44
+
45
+ **Honest assessment**: our `gepaDriver` is a **reflective full-rewrite driver**, not GEPA. It captures GEPA's *reflection* primitive but not its *Pareto* mechanism. The name oversells. A faithful renaming would be `reflectiveRewriteDriver`. A faithful implementation would add a Pareto candidate pool + combine step.
46
+
47
+ ---
48
+
49
+ ## Part 2 — SkillOpt (the paper + code)
50
+
51
+ **Source**:
52
+ - README: https://github.com/microsoft/SkillOpt
53
+ - Source: `/tmp/SkillOpt/skillopt/` (cloned 2026-05-27)
54
+ - Key files: `engine/trainer.py`, `optimizer/clip.py` (rank_and_select), `optimizer/update_modes.py`, `evaluation/gate.py`, `types.py`
55
+
56
+ ### What SkillOpt actually does
57
+
58
+ **6-stage per-step pipeline** (verbatim from `trainer.py:516` and adjacent):
59
+
60
+ 1. **Rollout** — `adapter.rollout(train_env, current_skill, ...)` collects trajectories on a batch.
61
+ 2. **Reflect** — `adapter.reflect()` analyses trajectories and emits **structured patches** (NOT full rewrites in patch mode). Failure trials → failure patches; success trials → success patches.
62
+ 3. **Aggregate** — `merge_patches(current_skill, all_failure_patches, all_success_patches, batch_size=merge_bs)` — hierarchically merges patches across accumulated batches.
63
+ 4. **Select** — `rank_and_select(current_skill, merged_patch, max_edits=edit_budget)` — if edit pool > budget, calls an optimizer LLM to **rank edits by importance** and keep top-L. Budget is "analogous to gradient clipping" (their words).
64
+ 5. **Update** — apply patch in one of 3 modes:
65
+ - **`patch`** — deterministic diff apply via `apply_patch_with_report()`; ops are `append | insert_after | replace | delete`
66
+ - **`rewrite_from_suggestions`** — LLM regenerates full skill from suggestions
67
+ - **`full_rewrite_minibatch`** — reflection directly emits complete candidate skills; select picks the best
68
+ 6. **Evaluate & Gate** — runs candidate on selection set, calls `evaluate_gate(cand_hard, current_score, best_score)`. Returns `accept_new_best | accept | reject` from a **literal `cand_hard > current_score`** comparison (`evaluation/gate.py:38`). No statistical test.
69
+
70
+ Plus epoch-level stages:
71
+ - **Slow update** — `run_slow_update()` builds longitudinal pairs across epochs.
72
+ - **Meta skill** — `run_meta_skill()` produces optimizer-side memory of patterns across adjacent epochs.
73
+
74
+ ### Canonical patch shape (from `types.py:22-45`)
75
+
76
+ ```python
77
+ EditOp = Literal["append", "insert_after", "replace", "delete"]
78
+
79
+ @dataclass
80
+ class Edit:
81
+ op: EditOp
82
+ content: str
83
+ target: str # for replace/delete/insert_after
84
+ support_count: int | None # how many trials voted for this edit
85
+ source_type: Literal["failure", "success"] | None
86
+ merge_level: int | None
87
+
88
+ @dataclass
89
+ class Patch:
90
+ edits: list[Edit]
91
+ reasoning: str
92
+ ranking_details: dict | None
93
+ ```
94
+
95
+ ### What `skillOptDriver` v0.51.0 in our substrate ACTUALLY does
96
+
97
+ Source: `src/campaign/drivers/skillopt.ts` (current as of 0.51.0)
98
+
99
+ - Single LLM call per `propose()` returning N full document rewrites
100
+ - Post-parse rejection on: (a) any H2 header dropped, (b) sentence-edit count > editBudget × 2
101
+ - Substantively equivalent to `gepaDriver` + 2 validation constraints
102
+
103
+ ### Deviations from SkillOpt
104
+
105
+ | SkillOpt actual | Our 0.51.0 `skillOptDriver` |
106
+ |---|---|
107
+ | 6-stage pipeline (rollout → reflect → aggregate → select → update → gate) | Single LLM call → N rewrites |
108
+ | **Patch-based edits** (`{op, target, content, support_count, source_type}`) | Full document rewrites only |
109
+ | `merge_patches()` hierarchical merge across batches | No aggregation; each `propose()` is independent |
110
+ | `rank_and_select(max_edits=edit_budget)` LLM-ranking of edits | All candidates that pass validation are returned |
111
+ | 3 update modes (`patch`, `rewrite_from_suggestions`, `full_rewrite_minibatch`) | Only `full_rewrite_minibatch`-equivalent |
112
+ | `evaluate_gate()` with `accept_new_best/accept/reject` codes | Substrate's outer gate decides ship/hold/inspect; driver doesn't see fine-grained accept signal |
113
+ | Longitudinal `slow_update` across epochs | Not implemented |
114
+ | `meta_skill` optimizer-side memory | Not implemented |
115
+ | Selection-set cache (`sel_cache`) for repeated candidate hashes | Not implemented |
116
+ | Edit-budget LR scheduler (constant / linear / cosine / autonomous) | Single fixed `editBudget` |
117
+ | Mini-batch accumulation (`steps_per_epoch`, `merge_batch_size`) | Not implemented |
118
+ | `decide_autonomous_learning_rate()` | Not implemented |
119
+ | `longitudinal_pair_policy` (mixed / changed / unchanged) | Not implemented |
120
+
121
+ **Honest assessment**: 13 substantive deviations. `skillOptDriver` 0.51.0 is **not** SkillOpt. It is `gepaDriver` with two post-validation constraints (section preservation, sentence-edit count). The methodology name oversells the implementation.
122
+
123
+ ### One thing where we are STRICTER than SkillOpt
124
+
125
+ **The gate.** SkillOpt: literal `cand_hard > current_score` (`evaluation/gate.py:38`). Our substrate: paired bootstrap + 95% CI + Cohen's d + MDE + p-value (`defaultProductionGate`). When the lift CI straddles zero, our gate returns `hold` / `inspect`. SkillOpt would accept any improvement at all, even single-sample noise.
126
+
127
+ This is real differentiation we have not been crediting ourselves for.
128
+
129
+ ---
130
+
131
+ ## Part 3 — Hermes Agent's "self-improvement"
132
+
133
+ **Source**: `/tmp/hermes-agent/` (cloned 2026-05-27)
134
+ - `agent/curator.py` (the actual loop)
135
+ - `agent/skill_commands.py`
136
+ - `agent/skill_utils.py`
137
+
138
+ ### What Hermes actually does
139
+
140
+ From `curator.py` line 1: "Curator — background skill maintenance orchestrator. The curator is an auxiliary-model task that periodically reviews agent-created skills and maintains the collection."
141
+
142
+ Trigger: idle-driven, with default `DEFAULT_INTERVAL_HOURS = 24 * 7` (7 days). When the agent has been idle for `DEFAULT_MIN_IDLE_HOURS = 2` and the last curator run was > 7 days ago, `maybe_run_curator()` spawns a forked AIAgent.
143
+
144
+ What the curator does:
145
+ - "Auto-transition lifecycle states based on derived skill activity timestamps"
146
+ - "Spawn a background review agent that can **pin / archive / consolidate / patch** agent-created skills via `skill_manage`"
147
+ - "Persist curator state (last_run_at, paused, etc.) in `.curator_state`"
148
+
149
+ Strict invariants:
150
+ - Only touches agent-created skills
151
+ - "Never auto-deletes — only archives"
152
+ - Pinned skills bypass auto-transitions
153
+ - Uses the auxiliary client (separate from main session)
154
+
155
+ ### Hermes' actual gate
156
+
157
+ **There is none.** The curator is an LLM editor making editorial decisions. There is no:
158
+ - Held-out validation
159
+ - Performance comparison between old and new skill versions
160
+ - Statistical test
161
+ - Rejection-on-regression mechanism
162
+
163
+ Skills are refined by an LLM looking at usage patterns; the refinement is accepted because the LLM proposed it.
164
+
165
+ ### Honest assessment
166
+
167
+ Hermes has a **skill curation system**, not a self-improvement loop. The README's claim "the only agent with a built-in learning loop" is generous — it's a 7-day-cron LLM librarian. There's no measurable guarantee that today's curated skill collection performs better than yesterday's.
168
+
169
+ Compare:
170
+ | Component | Hermes | SkillOpt | Tangle |
171
+ |---|---|---|---|
172
+ | Validation gate | None | `>` | Paired bootstrap CI |
173
+ | Patch-level edits | No (LLM rewrites whole skill) | Yes | No (full rewrite only) |
174
+ | Skill ranking / selection | No | Yes | No |
175
+ | Sample efficiency claim | None | 35× vs GRPO (Part 1 attributes this figure to the GEPA paper — TODO confirm it applies to SkillOpt) | None |
176
+ | Frequency | 7-day cron | Per training step | Per `selfImprove()` call |
177
+
178
+ Where Tangle WINS: the gate. Where SkillOpt WINS: the pipeline sophistication. Where Hermes WINS: the deployment story (multi-platform, multi-tool-backend).
179
+
180
+ ---
181
+
182
+ ## Part 4 — What we should actually do
183
+
184
+ ### Phase A — rename to honest names (0.51.1, this session)
185
+
186
+ The current `skillOptDriver` and `gepaDriver` names overclaim. Options:
187
+
188
+ 1. **Rename both:**
189
+ - `gepaDriver` → `reflectiveRewriteDriver` (drops the "Pareto" implication)
190
+ - `skillOptDriver` → `constrainedReflectiveDriver` (drops the SkillOpt-methodology implication)
191
+ - Reserve `gepaDriver` + `skillOptDriver` for faithful implementations
192
+ 2. **Keep `gepaDriver` name** (it's our most-used driver; renaming is disruptive); rename `skillOptDriver`.
193
+ 3. **Keep both names; add `@experimental` + a "differs from paper" docstring section.** Cheapest. Truthful enough.
194
+
195
+ Recommendation: **option 3 plus a frontmatter "deviations from paper" section** in each driver source file. Empirically test before renaming.
196
+
197
+ ### Phase B — build the honest empirical harness (0.51.1, this session)
198
+
199
+ `tests/driver-empirical.bench.ts` — for each driver:
200
+ - Same scenarios (5 synthetic + 5 real legal-agent scenarios)
201
+ - Same judge
202
+ - Same `baselineSurface`
203
+ - Same `budget` (1 gen, 3 candidates, holdout 0.3)
204
+ - Report: lift mean, lift CI95, p-value, rollouts spent, $$ spent
205
+
206
+ Drivers in the matrix:
207
+ - `gepaDriver` (current full-rewrite reflection)
208
+ - `skillOptDriver` (current 0.51.0 full-rewrite + constraints)
209
+ - Future: real `skillOptDriverV2` with patch mode
210
+
211
+ This is the **falsifiable test** of whether our drivers' methodology claims are worth the names.
212
+
213
+ ### Phase C — implement SkillOpt patch mode for real (0.52.0)
214
+
215
+ Build `skillOptDriverV2` with:
216
+ 1. **`Edit` type matching SkillOpt's**: `{op: 'append'|'insert_after'|'replace'|'delete', content, target?, support_count?, source_type?}`
217
+ 2. **Reflect step emits patches**, not full rewrites
218
+ 3. **`mergePatches()`** — LLM-driven hierarchical merge of failure + success patches
219
+ 4. **`rankAndSelect()`** — LLM-driven ranking when edit pool > budget
220
+ 5. **Deterministic `applyPatch()`** — string ops, no LLM
221
+ 6. **Keep our gate** (paired bootstrap CI). Don't downgrade to SkillOpt's `>` — that's our edge.
222
+
223
+ Estimated scope: 400-600 lines + tests.
224
+
225
+ ### Phase D — implement GEPA's Pareto frontier (0.53.0)
226
+
227
+ Build `gepaDriverV2` with:
228
+ 1. **Candidate pool** retained across generations (non-dominated)
229
+ 2. **Multi-objective evaluation** (composite + cost + length + diversity)
230
+ 3. **Combine step** — LLM combines lessons from non-dominated candidates
231
+ 4. Keep reflection.
232
+ 5. Sample-efficiency target: match the paper's ~35× claim on a benchmark we choose.
233
+
234
+ Estimated scope: 500-800 lines + tests.
235
+
236
+ ---
237
+
238
+ ## Source pointers (audit trail)
239
+
240
+ - GEPA paper: https://arxiv.org/abs/2507.19457
241
+ - SkillOpt repo: https://github.com/microsoft/SkillOpt (cloned at `/tmp/SkillOpt/` 2026-05-27)
242
+ - Hermes repo: https://github.com/NousResearch/hermes-agent (cloned at `/tmp/hermes-agent/` 2026-05-27)
243
+ - Our gepaDriver: `src/campaign/drivers/gepa.ts`
244
+ - Our skillOptDriver: `src/campaign/drivers/skillopt.ts`
245
+ - Our gate: `src/campaign/gates/default-production-gate.ts`
246
+ - Our reflection primitive: `src/reflective-mutation.ts`
247
+
248
+ Update this doc when:
249
+ - We discover new behavior in any of the upstream methods (via reading their code, not their READMEs)
250
+ - We ship a driver that closes one of the named gaps
251
+ - We run the empirical harness and have real numbers to add