lm-deluge 0.0.67__py3-none-any.whl → 0.0.88__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of lm-deluge might be problematic.
- lm_deluge/__init__.py +25 -2
- lm_deluge/api_requests/anthropic.py +92 -17
- lm_deluge/api_requests/base.py +47 -11
- lm_deluge/api_requests/bedrock.py +7 -4
- lm_deluge/api_requests/chat_reasoning.py +4 -0
- lm_deluge/api_requests/gemini.py +138 -18
- lm_deluge/api_requests/openai.py +114 -21
- lm_deluge/client.py +282 -49
- lm_deluge/config.py +15 -3
- lm_deluge/mock_openai.py +643 -0
- lm_deluge/models/__init__.py +12 -1
- lm_deluge/models/anthropic.py +17 -2
- lm_deluge/models/arcee.py +16 -0
- lm_deluge/models/deepseek.py +36 -4
- lm_deluge/models/google.py +29 -0
- lm_deluge/models/grok.py +24 -0
- lm_deluge/models/kimi.py +36 -0
- lm_deluge/models/minimax.py +10 -0
- lm_deluge/models/openai.py +100 -0
- lm_deluge/models/openrouter.py +86 -8
- lm_deluge/models/together.py +11 -0
- lm_deluge/models/zai.py +1 -0
- lm_deluge/pipelines/gepa/__init__.py +95 -0
- lm_deluge/pipelines/gepa/core.py +354 -0
- lm_deluge/pipelines/gepa/docs/samples.py +696 -0
- lm_deluge/pipelines/gepa/examples/01_synthetic_keywords.py +140 -0
- lm_deluge/pipelines/gepa/examples/02_gsm8k_math.py +261 -0
- lm_deluge/pipelines/gepa/examples/03_hotpotqa_multihop.py +300 -0
- lm_deluge/pipelines/gepa/examples/04_batch_classification.py +271 -0
- lm_deluge/pipelines/gepa/examples/simple_qa.py +129 -0
- lm_deluge/pipelines/gepa/optimizer.py +435 -0
- lm_deluge/pipelines/gepa/proposer.py +235 -0
- lm_deluge/pipelines/gepa/util.py +165 -0
- lm_deluge/{llm_tools → pipelines}/score.py +2 -2
- lm_deluge/{llm_tools → pipelines}/translate.py +5 -3
- lm_deluge/prompt.py +224 -40
- lm_deluge/request_context.py +7 -2
- lm_deluge/tool/__init__.py +1118 -0
- lm_deluge/tool/builtin/anthropic/__init__.py +300 -0
- lm_deluge/tool/builtin/gemini.py +59 -0
- lm_deluge/tool/builtin/openai.py +74 -0
- lm_deluge/tool/cua/__init__.py +173 -0
- lm_deluge/tool/cua/actions.py +148 -0
- lm_deluge/tool/cua/base.py +27 -0
- lm_deluge/tool/cua/batch.py +215 -0
- lm_deluge/tool/cua/converters.py +466 -0
- lm_deluge/tool/cua/kernel.py +702 -0
- lm_deluge/tool/cua/trycua.py +989 -0
- lm_deluge/tool/prefab/__init__.py +45 -0
- lm_deluge/tool/prefab/batch_tool.py +156 -0
- lm_deluge/tool/prefab/docs.py +1119 -0
- lm_deluge/tool/prefab/email.py +294 -0
- lm_deluge/tool/prefab/filesystem.py +1711 -0
- lm_deluge/tool/prefab/full_text_search/__init__.py +285 -0
- lm_deluge/tool/prefab/full_text_search/tantivy_index.py +396 -0
- lm_deluge/tool/prefab/memory.py +458 -0
- lm_deluge/tool/prefab/otc/__init__.py +165 -0
- lm_deluge/tool/prefab/otc/executor.py +281 -0
- lm_deluge/tool/prefab/otc/parse.py +188 -0
- lm_deluge/tool/prefab/random.py +212 -0
- lm_deluge/tool/prefab/rlm/__init__.py +296 -0
- lm_deluge/tool/prefab/rlm/executor.py +349 -0
- lm_deluge/tool/prefab/rlm/parse.py +144 -0
- lm_deluge/tool/prefab/sandbox.py +1621 -0
- lm_deluge/tool/prefab/sheets.py +385 -0
- lm_deluge/tool/prefab/subagents.py +233 -0
- lm_deluge/tool/prefab/todos.py +342 -0
- lm_deluge/tool/prefab/tool_search.py +169 -0
- lm_deluge/tool/prefab/web_search.py +199 -0
- lm_deluge/tracker.py +16 -13
- lm_deluge/util/schema.py +412 -0
- lm_deluge/warnings.py +8 -0
- {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.88.dist-info}/METADATA +22 -9
- lm_deluge-0.0.88.dist-info/RECORD +117 -0
- lm_deluge/built_in_tools/anthropic/__init__.py +0 -128
- lm_deluge/built_in_tools/openai.py +0 -28
- lm_deluge/presets/cerebras.py +0 -17
- lm_deluge/presets/meta.py +0 -13
- lm_deluge/tool.py +0 -849
- lm_deluge-0.0.67.dist-info/RECORD +0 -72
- lm_deluge/{llm_tools → pipelines}/__init__.py +1 -1
- /lm_deluge/{llm_tools → pipelines}/classify.py +0 -0
- /lm_deluge/{llm_tools → pipelines}/extract.py +0 -0
- /lm_deluge/{llm_tools → pipelines}/locate.py +0 -0
- /lm_deluge/{llm_tools → pipelines}/ocr.py +0 -0
- /lm_deluge/{built_in_tools → tool/builtin}/anthropic/bash.py +0 -0
- /lm_deluge/{built_in_tools → tool/builtin}/anthropic/computer_use.py +0 -0
- /lm_deluge/{built_in_tools → tool/builtin}/anthropic/editor.py +0 -0
- /lm_deluge/{built_in_tools → tool/builtin}/base.py +0 -0
- {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.88.dist-info}/WHEEL +0 -0
- {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.88.dist-info}/licenses/LICENSE +0 -0
- {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.88.dist-info}/top_level.txt +0 -0
lm_deluge/pipelines/gepa/optimizer.py
@@ -0,0 +1,435 @@
+"""
+GEPA optimizer.
+
+Main optimization loop that evolves text components using trajectory-based feedback.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import random
+from collections.abc import Awaitable, Callable
+from pathlib import Path
+from typing import Generic, TypeVar
+
+from lm_deluge.client import _LLMClient
+from lm_deluge.pipelines.gepa.core import (
+    Component,
+    EvalResult,
+    GEPAResult,
+    GEPAState,
+)
+from lm_deluge.pipelines.gepa.proposer import propose_improvement
+
+T = TypeVar("T")  # Example type
+
+# Type for the user-provided evaluate function (must be async)
+AsyncEvaluateFn = Callable[[_LLMClient, dict[str, str], T], Awaitable[EvalResult]]
+
+
+def optimize(
+    components: dict[str, Component],
+    evaluate_fn: AsyncEvaluateFn[T],
+    dataset: list[T],
+    task_client: _LLMClient,
+    proposer_client: _LLMClient,
+    *,
+    val_dataset: list[T] | None = None,
+    max_iterations: int = 100,
+    max_evals: int | None = None,
+    minibatch_size: int = 4,
+    perfect_score: float = 1.0,
+    proposal_prompt_template: str | None = None,
+    meta_instructions: str | None = None,
+    run_dir: str | Path | None = None,
+    log_fn: Callable[[str], None] | None = None,
+    save_trajectories: bool = False,
+    seed: int = 0,
+) -> GEPAResult:
+    """
+    Run GEPA optimization to improve text components.
+
+    Args:
+        components: The text components to optimize (name -> Component)
+        evaluate_fn: Async function that evaluates one example: (client, values, example) -> EvalResult
+        dataset: Training examples
+        task_client: LLMClient for running evaluations
+        proposer_client: LLMClient for generating proposals
+        val_dataset: Optional separate validation set (defaults to dataset)
+        max_iterations: Maximum optimization iterations
+        max_evals: Maximum total evaluations (budget)
+        minibatch_size: Examples per minibatch for proposal evaluation
+        perfect_score: Score considered perfect (skip examples that achieve this)
+        proposal_prompt_template: Custom prompt template for proposer
+        meta_instructions: Guidelines to steer the proposer (e.g., "Don't overfit to specific examples")
+        run_dir: Directory to save state and trajectories
+        log_fn: Logging function (defaults to print)
+        save_trajectories: Whether to save trajectories to disk
+        seed: Random seed for reproducibility
+
+    Returns:
+        GEPAResult with optimization results
+    """
+    engine = GEPAEngine(
+        components=components,
+        evaluate_fn=evaluate_fn,
+        dataset=dataset,
+        task_client=task_client,
+        proposer_client=proposer_client,
+        val_dataset=val_dataset,
+        max_iterations=max_iterations,
+        max_evals=max_evals,
+        minibatch_size=minibatch_size,
+        perfect_score=perfect_score,
+        proposal_prompt_template=proposal_prompt_template,
+        meta_instructions=meta_instructions,
+        run_dir=run_dir,
+        log_fn=log_fn,
+        save_trajectories=save_trajectories,
+        seed=seed,
+    )
+
+    return asyncio.run(engine.run())
+
+
+class GEPAEngine(Generic[T]):
+    """
+    Stateful GEPA optimizer.
+
+    Use this for more control over the optimization process:
+    - Resume from saved state
+    - Step through iterations manually
+    - Access intermediate state
+    """
+
+    def __init__(
+        self,
+        components: dict[str, Component],
+        evaluate_fn: AsyncEvaluateFn[T],
+        dataset: list[T],
+        task_client: _LLMClient,
+        proposer_client: _LLMClient,
+        *,
+        val_dataset: list[T] | None = None,
+        max_iterations: int = 100,
+        max_evals: int | None = None,
+        minibatch_size: int = 4,
+        perfect_score: float = 1.0,
+        proposal_prompt_template: str | None = None,
+        meta_instructions: str | None = None,
+        run_dir: str | Path | None = None,
+        log_fn: Callable[[str], None] | None = None,
+        save_trajectories: bool = False,
+        seed: int = 0,
+    ):
+        self.components = components
+        self.evaluate_fn = evaluate_fn
+        self.dataset = dataset
+        self.task_client = task_client
+        self.proposer_client = proposer_client
+        self.val_dataset = val_dataset if val_dataset is not None else dataset
+        self.max_iterations = max_iterations
+        self.max_evals = max_evals
+        self.minibatch_size = minibatch_size
+        self.perfect_score = perfect_score
+        self.proposal_prompt_template = proposal_prompt_template
+        self.meta_instructions = meta_instructions
+        self.run_dir = Path(run_dir) if run_dir else None
+        self.log_fn = log_fn or print
+        self.save_trajectories = save_trajectories
+
+        self.rng = random.Random(seed)
+        self.state: GEPAState | None = None
+        self._trajectory_counter = 0
+
+    def _log(self, msg: str) -> None:
+        self.log_fn(msg)
+
+    async def _evaluate_batch(
+        self,
+        examples: list[tuple[int, T]],  # (index, example) pairs
+        component_values: dict[str, str],
+    ) -> list[tuple[int, EvalResult]]:
+        """Evaluate a batch of examples concurrently, return (index, result) pairs."""
+
+        async def eval_one(idx: int, example: T) -> tuple[int, EvalResult] | None:
+            try:
+                result = await self.evaluate_fn(
+                    self.task_client, component_values, example
+                )
+                return (idx, result)
+            except Exception as e:
+                self._log(f"Error evaluating example {idx}: {e}")
+                return None
+
+        # Run all evaluations concurrently
+        tasks = [eval_one(idx, example) for idx, example in examples]
+        results_raw = await asyncio.gather(*tasks)
+
+        # Filter out None results (failed evaluations)
+        results = [r for r in results_raw if r is not None]
+        return results
+
+    async def _evaluate_all(
+        self,
+        examples: list[T],
+        component_values: dict[str, str],
+    ) -> dict[int, float]:
+        """Evaluate all examples, return scores dict."""
+        indexed = [(i, ex) for i, ex in enumerate(examples)]
+        results = await self._evaluate_batch(indexed, component_values)
+
+        if self.state:
+            self.state.total_evals += len(results)
+
+        return {idx: result.score for idx, result in results}
+
+    def _save_trajectory(
+        self,
+        iteration: int,
+        tag: str,
+        candidate_values: dict[str, str],
+        example_idx: int,
+        result: EvalResult,
+    ) -> None:
+        """Save a trajectory to disk for debugging."""
+        if not self.save_trajectories or not self.run_dir:
+            return
+
+        traj_dir = self.run_dir / "trajectories"
+        traj_dir.mkdir(parents=True, exist_ok=True)
+
+        self._trajectory_counter += 1
+        filename = f"{self._trajectory_counter:04d}_iter{iteration}_{tag}.json"
+
+        data = {
+            "iteration": iteration,
+            "tag": tag,
+            "example_idx": example_idx,
+            "candidate": candidate_values,
+            "score": result.score,
+            "feedback": result.feedback,
+            "conversation": result.conversation.to_log(),
+        }
+
+        try:
+            with open(traj_dir / filename, "w") as f:
+                json.dump(data, f, indent=2, default=str)
+        except Exception as e:
+            self._log(f"Failed to save trajectory: {e}")
+
+    async def initialize(self) -> None:
+        """Initialize state by evaluating seed candidate on validation set."""
+        self._log("Evaluating seed candidate...")
+
+        seed_values = {name: comp.value for name, comp in self.components.items()}
+        seed_scores = await self._evaluate_all(self.val_dataset, seed_values)
+
+        self.state = GEPAState.initialize(self.components, seed_scores)
+
+        avg_score = sum(seed_scores.values()) / len(seed_scores) if seed_scores else 0.0
+        self._log(
+            f"Seed candidate: avg_score={avg_score:.4f} on {len(seed_scores)} examples"
+        )
+
+    def _should_stop(self) -> bool:
+        """Check if we should stop optimization."""
+        if self.state is None:
+            return True
+
+        if self.state.iteration >= self.max_iterations:
+            return True
+
+        if self.max_evals and self.state.total_evals >= self.max_evals:
+            return True
+
+        # Stop if best candidate achieves perfect score on val set
+        best_idx = self.state.best_candidate_idx()
+        best_avg = self.state.get_candidate_avg_score(best_idx)
+        if best_avg >= self.perfect_score:
+            self._log(
+                f"Best candidate achieved perfect score ({best_avg:.4f}), stopping"
+            )
+            return True
+
+        return False
+
+    async def step(self) -> bool:
+        """
+        Run one optimization iteration.
+
+        Returns:
+            True if optimization should continue, False if done
+        """
+        if self.state is None:
+            await self.initialize()
+
+        assert self.state is not None
+
+        if self._should_stop():
+            return False
+
+        self.state.iteration += 1
+        iteration = self.state.iteration
+
+        # Get current best candidate
+        best_idx = self.state.best_candidate_idx()
+        current_values = self.state.candidates[best_idx]
+
+        # Find examples where the BEST candidate isn't perfect
+        # (not Pareto front - we want to improve the best single candidate)
+        best_scores = self.state.candidate_scores[best_idx]
+        improvable = [
+            ex_idx
+            for ex_idx, score in best_scores.items()
+            if score < self.perfect_score
+        ]
+
+        if not improvable:
+            # Best candidate is perfect on all examples it was evaluated on
+            # Just pick a random example to re-evaluate and potentially find issues
+            improvable = list(best_scores.keys())
+        if not improvable:
+            self._log(f"Iteration {iteration}: No examples to evaluate")
+            return False
+
+        # Pick an example to focus on (prefer non-perfect ones)
+        focus_idx = self.rng.choice(improvable)
+
+        # Evaluate current candidate on focus example to get trajectory
+        focus_example = self.val_dataset[focus_idx]
+        results = await self._evaluate_batch(
+            [(focus_idx, focus_example)], current_values
+        )
+
+        if not results:
+            self._log(f"Iteration {iteration}: Failed to evaluate focus example")
+            return True
+
+        _, focus_result = results[0]
+        self.state.total_evals += 1
+
+        if self.save_trajectories:
+            self._save_trajectory(
+                iteration, "focus", current_values, focus_idx, focus_result
+            )
+
+        # Generate proposal
+        proposal = await propose_improvement(
+            proposer_client=self.proposer_client,
+            eval_result=focus_result,
+            components=self.components,
+            current_values=current_values,
+            prompt_template=self.proposal_prompt_template,
+            meta_instructions=self.meta_instructions,
+        )
+
+        if proposal is None:
+            self._log(f"Iteration {iteration}: No proposal generated")
+            return True
+
+        self._log(
+            f"Iteration {iteration}: Proposing change to '{proposal.component_name}' - {proposal.reasoning[:80]}..."
+        )
+
+        # Build new candidate
+        new_values = dict(current_values)
+        new_values[proposal.component_name] = proposal.new_value
+
+        # Evaluate on minibatch (including focus example)
+        minibatch_indices = [focus_idx]
+        other_indices = [i for i in improvable if i != focus_idx]
+        if other_indices:
+            additional = self.rng.sample(
+                other_indices, min(self.minibatch_size - 1, len(other_indices))
+            )
+            minibatch_indices.extend(additional)
+
+        # Evaluate old and new candidates on minibatch concurrently
+        minibatch = [(i, self.val_dataset[i]) for i in minibatch_indices]
+        old_results, new_results = await asyncio.gather(
+            self._evaluate_batch(minibatch, current_values),
+            self._evaluate_batch(minibatch, new_values),
+        )
+
+        old_sum = sum(r.score for _, r in old_results)
+        new_sum = sum(r.score for _, r in new_results)
+
+        self.state.total_evals += len(old_results) + len(new_results)
+
+        # Accept if improved
+        if new_sum <= old_sum:
+            self._log(
+                f"Iteration {iteration}: Rejected (old={old_sum:.3f}, new={new_sum:.3f})"
+            )
+            return True
+
+        self._log(
+            f"Iteration {iteration}: Accepted (old={old_sum:.3f}, new={new_sum:.3f})"
+        )
+
+        # Full validation evaluation
+        val_scores = await self._evaluate_all(self.val_dataset, new_values)
+
+        # Add to population
+        new_idx = self.state.add_candidate(new_values, best_idx, val_scores)
+
+        new_avg = sum(val_scores.values()) / len(val_scores) if val_scores else 0.0
+        best_avg = self.state.get_candidate_avg_score(self.state.best_candidate_idx())
+        self._log(
+            f"  New candidate {new_idx}: val_avg={new_avg:.4f}, best={best_avg:.4f}, "
+            f"pool={len(self.state.candidates)}"
+        )
+
+        # Save state periodically
+        if self.run_dir and iteration % 10 == 0:
+            self.state.save(self.run_dir)
+
+        return True
+
+    async def run(self) -> GEPAResult:
+        """Run optimization until stopping condition."""
+        if self.state is None:
+            await self.initialize()
+
+        try:
+            from tqdm import tqdm
+
+            pbar = tqdm(
+                total=self.max_iterations,
+                desc="GEPA",
+                unit="iter",
+            )
+        except ImportError:
+            pbar = None
+
+        while await self.step():
+            if pbar:
+                pbar.update(1)
+                pbar.set_postfix(
+                    evals=self.state.total_evals if self.state else 0,
+                    best=f"{self.state.get_candidate_avg_score(self.state.best_candidate_idx()):.3f}"
+                    if self.state
+                    else 0,
+                )
+
+        if pbar:
+            pbar.close()
+
+        # Save final state
+        if self.run_dir and self.state:
+            self.state.save(self.run_dir)
+
+        return self.result()
+
+    def result(self) -> GEPAResult:
+        """Get current result as immutable snapshot."""
+        if self.state is None:
+            raise RuntimeError(
+                "Optimizer not initialized. Call initialize() or run() first."
+            )
+
+        return GEPAResult.from_state(
+            self.state, run_dir=str(self.run_dir) if self.run_dir else None
+        )
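For orientation, here is a minimal sketch of driving the optimize entry point above. Only the keyword arguments of optimize and the (client, values, example) -> EvalResult contract of AsyncEvaluateFn come from the diff; the Component and EvalResult constructor signatures, the LLMClient(...) setup, the resp.conversation attribute, and the toy dataset are illustrative assumptions, not confirmed by this release.

from lm_deluge import LLMClient  # assumed public counterpart of _LLMClient
from lm_deluge.pipelines.gepa import optimize
from lm_deluge.pipelines.gepa.core import Component, EvalResult

# Component is assumed to take the .value / .description fields the engine reads.
components = {
    "system_prompt": Component(
        value="You are a careful math tutor. Answer with just the final number.",
        description="System prompt for the task model.",
    ),
}

dataset = [
    {"question": "What is 2 + 2?", "answer": "4"},
    {"question": "What is 7 * 6?", "answer": "42"},
]

async def evaluate_fn(client, values, example) -> EvalResult:
    # Matches AsyncEvaluateFn: task client, current component values, one example.
    resp = await client.start(f"{values['system_prompt']}\n\n{example['question']}")
    score = 1.0 if example["answer"] in (resp.completion or "") else 0.0
    return EvalResult(
        score=score,
        feedback=f"Expected {example['answer']!r} in the reply.",
        conversation=resp.conversation,  # assumed: EvalResult carries the trajectory
    )

# optimize() wraps GEPAEngine and calls asyncio.run() internally, so it is
# invoked from synchronous code; the model names here are placeholders.
result = optimize(
    components=components,
    evaluate_fn=evaluate_fn,
    dataset=dataset,
    task_client=LLMClient("gpt-4.1-mini"),
    proposer_client=LLMClient("gpt-4.1"),
    max_iterations=20,
    minibatch_size=4,
)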
lm_deluge/pipelines/gepa/proposer.py
@@ -0,0 +1,235 @@
+"""
+Proposer for GEPA optimization.
+
+The proposer analyzes a trajectory and proposes improvements to ONE component.
+"""
+
+from __future__ import annotations
+
+from lm_deluge.prompt import Conversation
+
+from lm_deluge.pipelines.gepa.core import Component, EvalResult, Proposal
+from lm_deluge.pipelines.gepa.util import (
+    extract_text_from_response,
+    format_components_for_prompt,
+    format_conversation_compact,
+)
+
+
+DEFAULT_PROPOSAL_PROMPT = """You are optimizing an AI system by improving its text configuration.
+
+## The Trajectory
+
+Below is a conversation showing what the AI did on a task:
+
+<trajectory>
+{trajectory}
+</trajectory>
+
+## Feedback
+
+{feedback}
+
+## Components
+
+These are the text components that control the AI's behavior. You can modify ONE of them:
+
+{components}
+{meta_instructions}
+## Your Task
+
+1. Analyze the trajectory to understand what went wrong (or could be better)
+2. Identify which component is most likely responsible
+3. Propose a specific improvement to that ONE component
+
+Think about:
+- Did the AI misunderstand the task? (maybe the system prompt needs clarity)
+- Did it use tools incorrectly? (maybe tool descriptions need improvement)
+- Did it miss important information? (maybe instructions need to be more explicit)
+
+## Response Format
+
+Respond with:
+COMPONENT: <name of the component to change>
+REASONING: <1-2 sentences on why this change will help>
+NEW_VALUE:
+```
+<the complete improved text for this component>
+```
+"""
+
+
+def build_proposal_prompt(
+    conversation: Conversation,
+    feedback: str,
+    components: dict[str, Component],
+    current_values: dict[str, str],
+    template: str | None = None,
+    meta_instructions: str | None = None,
+) -> str:
+    """
+    Build the prompt for the proposer LLM.
+
+    Args:
+        conversation: The trajectory to analyze
+        feedback: Feedback on the result
+        components: Component definitions (with descriptions)
+        current_values: Current text values for each component
+        template: Optional custom prompt template
+        meta_instructions: Optional instructions to guide the proposer's behavior
+            (e.g., "Focus on general improvements, don't overfit to specific examples")
+
+    Returns:
+        Formatted prompt string
+    """
+    template = template or DEFAULT_PROPOSAL_PROMPT
+
+    # Format trajectory
+    trajectory_str = format_conversation_compact(conversation)
+
+    # Format components
+    descriptions = {name: comp.description for name, comp in components.items()}
+    components_str = format_components_for_prompt(current_values, descriptions)
+
+    # Format meta instructions
+    if meta_instructions:
+        meta_str = f"\n## Guidelines\n\n{meta_instructions}\n\n"
+    else:
+        meta_str = "\n"
+
+    return template.format(
+        trajectory=trajectory_str,
+        feedback=feedback,
+        components=components_str,
+        meta_instructions=meta_str,
+    )
+
+
+def parse_proposal_response(
+    response: str, valid_components: list[str]
+) -> Proposal | None:
+    """
+    Parse the proposer's response to extract the proposal.
+
+    Args:
+        response: Raw LLM response
+        valid_components: List of valid component names
+
+    Returns:
+        Proposal if parsing succeeded, None otherwise
+    """
+    # Find COMPONENT line
+    component_name = None
+    for line in response.split("\n"):
+        line = line.strip()
+        if line.upper().startswith("COMPONENT:"):
+            component_name = line.split(":", 1)[1].strip()
+            break
+
+    if not component_name:
+        return None
+
+    # Validate component name
+    if component_name not in valid_components:
+        # Try case-insensitive match
+        for valid in valid_components:
+            if valid.lower() == component_name.lower():
+                component_name = valid
+                break
+        else:
+            return None
+
+    # Find REASONING line
+    reasoning = ""
+    for line in response.split("\n"):
+        line = line.strip()
+        if line.upper().startswith("REASONING:"):
+            reasoning = line.split(":", 1)[1].strip()
+            break
+
+    # Extract new value from code block
+    new_value = extract_text_from_response(response)
+    if not new_value:
+        return None
+
+    return Proposal(
+        component_name=component_name,
+        new_value=new_value,
+        reasoning=reasoning,
+    )
+
+
+async def propose_improvement(
+    proposer_client,  # LLMClient
+    eval_result: EvalResult,
+    components: dict[str, Component],
+    current_values: dict[str, str],
+    prompt_template: str | None = None,
+    meta_instructions: str | None = None,
+) -> Proposal | None:
+    """
+    Use an LLM to propose an improvement to one component.
+
+    Args:
+        proposer_client: LLMClient for generating proposals
+        eval_result: The evaluation result containing trajectory and feedback
+        components: Component definitions
+        current_values: Current text values
+        prompt_template: Optional custom prompt template
+        meta_instructions: Optional guidelines to steer the proposer
+            (e.g., "Don't overfit to specific examples")
+
+    Returns:
+        Proposal if successful, None otherwise
+    """
+    # Build prompt
+    prompt = build_proposal_prompt(
+        conversation=eval_result.conversation,
+        feedback=eval_result.feedback,
+        components=components,
+        current_values=current_values,
+        template=prompt_template,
+        meta_instructions=meta_instructions,
+    )
+
+    # Call LLM
+    response = await proposer_client.start(prompt)
+    if not response or not response.completion:
+        return None
+
+    response_text = response.completion
+
+    # Parse response
+    valid_components = list(components.keys())
+    return parse_proposal_response(response_text, valid_components)
+
+
+def propose_improvement_sync(
+    proposer_client,  # LLMClient
+    eval_result: EvalResult,
+    components: dict[str, Component],
+    current_values: dict[str, str],
+    prompt_template: str | None = None,
+) -> Proposal | None:
+    """
+    Synchronous version of propose_improvement.
+    """
+    # Build prompt
+    prompt = build_proposal_prompt(
+        conversation=eval_result.conversation,
+        feedback=eval_result.feedback,
+        components=components,
+        current_values=current_values,
+        template=prompt_template,
+    )
+
+    # Call LLM
+    responses = proposer_client.process_prompts_sync([prompt], show_progress=False)
+    if not responses or not responses[0].completion:
+        return None
+
+    response_text = responses[0].completion
+
+    # Parse response
+    valid_components = list(components.keys())
+    return parse_proposal_response(response_text, valid_components)
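The COMPONENT / REASONING / NEW_VALUE format that DEFAULT_PROPOSAL_PROMPT requests can be exercised directly against parse_proposal_response. A small sketch; it assumes extract_text_from_response (defined in util.py, not shown in this diff) returns the contents of the fenced block.

from lm_deluge.pipelines.gepa.proposer import parse_proposal_response

# A well-formed proposer reply, built by concatenation to keep the fenced
# NEW_VALUE block readable.
raw = (
    "COMPONENT: system_prompt\n"
    "REASONING: The model ignored the required output format.\n"
    "NEW_VALUE:\n"
    "```\n"
    "You are a careful math tutor. Always answer with just the final number.\n"
    "```\n"
)

proposal = parse_proposal_response(raw, ["system_prompt"])
if proposal is None:
    # Mirrors the guards above: a missing COMPONENT line, an unknown component
    # name, or an empty NEW_VALUE block yields None rather than raising.
    print("parse failed")
else:
    print(proposal.component_name)  # "system_prompt" (case-insensitive match allowed)
    print(proposal.reasoning)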