lm-deluge 0.0.67__py3-none-any.whl → 0.0.90__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lm-deluge might be problematic.

Files changed (108)
  1. lm_deluge/__init__.py +1 -2
  2. lm_deluge/api_requests/anthropic.py +117 -22
  3. lm_deluge/api_requests/base.py +84 -11
  4. lm_deluge/api_requests/bedrock.py +30 -6
  5. lm_deluge/api_requests/chat_reasoning.py +4 -0
  6. lm_deluge/api_requests/gemini.py +166 -20
  7. lm_deluge/api_requests/openai.py +145 -25
  8. lm_deluge/batches.py +15 -45
  9. lm_deluge/client.py +309 -50
  10. lm_deluge/config.py +15 -3
  11. lm_deluge/models/__init__.py +14 -1
  12. lm_deluge/models/anthropic.py +29 -14
  13. lm_deluge/models/arcee.py +16 -0
  14. lm_deluge/models/deepseek.py +36 -4
  15. lm_deluge/models/google.py +42 -0
  16. lm_deluge/models/grok.py +24 -0
  17. lm_deluge/models/kimi.py +36 -0
  18. lm_deluge/models/minimax.py +18 -0
  19. lm_deluge/models/openai.py +100 -0
  20. lm_deluge/models/openrouter.py +133 -7
  21. lm_deluge/models/together.py +11 -0
  22. lm_deluge/models/zai.py +50 -0
  23. lm_deluge/pipelines/gepa/__init__.py +95 -0
  24. lm_deluge/pipelines/gepa/core.py +354 -0
  25. lm_deluge/pipelines/gepa/docs/samples.py +705 -0
  26. lm_deluge/pipelines/gepa/examples/01_synthetic_keywords.py +140 -0
  27. lm_deluge/pipelines/gepa/examples/02_gsm8k_math.py +261 -0
  28. lm_deluge/pipelines/gepa/examples/03_hotpotqa_multihop.py +300 -0
  29. lm_deluge/pipelines/gepa/examples/04_batch_classification.py +271 -0
  30. lm_deluge/pipelines/gepa/examples/simple_qa.py +129 -0
  31. lm_deluge/pipelines/gepa/optimizer.py +435 -0
  32. lm_deluge/pipelines/gepa/proposer.py +235 -0
  33. lm_deluge/pipelines/gepa/util.py +165 -0
  34. lm_deluge/{llm_tools → pipelines}/score.py +2 -2
  35. lm_deluge/{llm_tools → pipelines}/translate.py +5 -3
  36. lm_deluge/prompt.py +537 -88
  37. lm_deluge/request_context.py +7 -2
  38. lm_deluge/server/__init__.py +24 -0
  39. lm_deluge/server/__main__.py +144 -0
  40. lm_deluge/server/adapters.py +369 -0
  41. lm_deluge/server/app.py +388 -0
  42. lm_deluge/server/auth.py +71 -0
  43. lm_deluge/server/model_policy.py +215 -0
  44. lm_deluge/server/models_anthropic.py +172 -0
  45. lm_deluge/server/models_openai.py +175 -0
  46. lm_deluge/tool/__init__.py +1130 -0
  47. lm_deluge/tool/builtin/anthropic/__init__.py +300 -0
  48. lm_deluge/tool/builtin/anthropic/bash.py +0 -0
  49. lm_deluge/tool/builtin/anthropic/computer_use.py +0 -0
  50. lm_deluge/tool/builtin/gemini.py +59 -0
  51. lm_deluge/tool/builtin/openai.py +74 -0
  52. lm_deluge/tool/cua/__init__.py +173 -0
  53. lm_deluge/tool/cua/actions.py +148 -0
  54. lm_deluge/tool/cua/base.py +27 -0
  55. lm_deluge/tool/cua/batch.py +215 -0
  56. lm_deluge/tool/cua/converters.py +466 -0
  57. lm_deluge/tool/cua/kernel.py +702 -0
  58. lm_deluge/tool/cua/trycua.py +989 -0
  59. lm_deluge/tool/prefab/__init__.py +45 -0
  60. lm_deluge/tool/prefab/batch_tool.py +156 -0
  61. lm_deluge/tool/prefab/docs.py +1119 -0
  62. lm_deluge/tool/prefab/email.py +294 -0
  63. lm_deluge/tool/prefab/filesystem.py +1711 -0
  64. lm_deluge/tool/prefab/full_text_search/__init__.py +285 -0
  65. lm_deluge/tool/prefab/full_text_search/tantivy_index.py +396 -0
  66. lm_deluge/tool/prefab/memory.py +458 -0
  67. lm_deluge/tool/prefab/otc/__init__.py +165 -0
  68. lm_deluge/tool/prefab/otc/executor.py +281 -0
  69. lm_deluge/tool/prefab/otc/parse.py +188 -0
  70. lm_deluge/tool/prefab/random.py +212 -0
  71. lm_deluge/tool/prefab/rlm/__init__.py +296 -0
  72. lm_deluge/tool/prefab/rlm/executor.py +349 -0
  73. lm_deluge/tool/prefab/rlm/parse.py +144 -0
  74. lm_deluge/tool/prefab/sandbox/__init__.py +19 -0
  75. lm_deluge/tool/prefab/sandbox/daytona_sandbox.py +483 -0
  76. lm_deluge/tool/prefab/sandbox/docker_sandbox.py +609 -0
  77. lm_deluge/tool/prefab/sandbox/fargate_sandbox.py +546 -0
  78. lm_deluge/tool/prefab/sandbox/modal_sandbox.py +469 -0
  79. lm_deluge/tool/prefab/sandbox/seatbelt_sandbox.py +827 -0
  80. lm_deluge/tool/prefab/sheets.py +385 -0
  81. lm_deluge/tool/prefab/skills.py +0 -0
  82. lm_deluge/tool/prefab/subagents.py +233 -0
  83. lm_deluge/tool/prefab/todos.py +342 -0
  84. lm_deluge/tool/prefab/tool_search.py +169 -0
  85. lm_deluge/tool/prefab/web_search.py +199 -0
  86. lm_deluge/tracker.py +16 -13
  87. lm_deluge/util/schema.py +412 -0
  88. lm_deluge/warnings.py +8 -0
  89. {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.90.dist-info}/METADATA +23 -9
  90. lm_deluge-0.0.90.dist-info/RECORD +132 -0
  91. lm_deluge/built_in_tools/anthropic/__init__.py +0 -128
  92. lm_deluge/built_in_tools/openai.py +0 -28
  93. lm_deluge/presets/cerebras.py +0 -17
  94. lm_deluge/presets/meta.py +0 -13
  95. lm_deluge/tool.py +0 -849
  96. lm_deluge-0.0.67.dist-info/RECORD +0 -72
  97. lm_deluge/{llm_tools → pipelines}/__init__.py +1 -1
  98. /lm_deluge/{llm_tools → pipelines}/classify.py +0 -0
  99. /lm_deluge/{llm_tools → pipelines}/extract.py +0 -0
  100. /lm_deluge/{llm_tools → pipelines}/locate.py +0 -0
  101. /lm_deluge/{llm_tools → pipelines}/ocr.py +0 -0
  102. /lm_deluge/{built_in_tools/anthropic/bash.py → skills/anthropic.py} +0 -0
  103. /lm_deluge/{built_in_tools/anthropic/computer_use.py → skills/compat.py} +0 -0
  104. /lm_deluge/{built_in_tools → tool/builtin}/anthropic/editor.py +0 -0
  105. /lm_deluge/{built_in_tools → tool/builtin}/base.py +0 -0
  106. {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.90.dist-info}/WHEEL +0 -0
  107. {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.90.dist-info}/licenses/LICENSE +0 -0
  108. {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.90.dist-info}/top_level.txt +0 -0
lm_deluge/pipelines/gepa/optimizer.py
@@ -0,0 +1,435 @@
+ """
+ GEPA optimizer.
+
+ Main optimization loop that evolves text components using trajectory-based feedback.
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import random
+ from collections.abc import Awaitable, Callable
+ from pathlib import Path
+ from typing import Generic, TypeVar
+
+ from lm_deluge.client import _LLMClient
+ from lm_deluge.pipelines.gepa.core import (
+     Component,
+     EvalResult,
+     GEPAResult,
+     GEPAState,
+ )
+ from lm_deluge.pipelines.gepa.proposer import propose_improvement
+
+ T = TypeVar("T")  # Example type
+
+ # Type for the user-provided evaluate function (must be async)
+ AsyncEvaluateFn = Callable[[_LLMClient, dict[str, str], T], Awaitable[EvalResult]]
+
+
+ def optimize(
+     components: dict[str, Component],
+     evaluate_fn: AsyncEvaluateFn[T],
+     dataset: list[T],
+     task_client: _LLMClient,
+     proposer_client: _LLMClient,
+     *,
+     val_dataset: list[T] | None = None,
+     max_iterations: int = 100,
+     max_evals: int | None = None,
+     minibatch_size: int = 4,
+     perfect_score: float = 1.0,
+     proposal_prompt_template: str | None = None,
+     meta_instructions: str | None = None,
+     run_dir: str | Path | None = None,
+     log_fn: Callable[[str], None] | None = None,
+     save_trajectories: bool = False,
+     seed: int = 0,
+ ) -> GEPAResult:
+     """
+     Run GEPA optimization to improve text components.
+
+     Args:
+         components: The text components to optimize (name -> Component)
+         evaluate_fn: Async function that evaluates one example: (client, values, example) -> EvalResult
+         dataset: Training examples
+         task_client: LLMClient for running evaluations
+         proposer_client: LLMClient for generating proposals
+         val_dataset: Optional separate validation set (defaults to dataset)
+         max_iterations: Maximum optimization iterations
+         max_evals: Maximum total evaluations (budget)
+         minibatch_size: Examples per minibatch for proposal evaluation
+         perfect_score: Score considered perfect (skip examples that achieve this)
+         proposal_prompt_template: Custom prompt template for proposer
+         meta_instructions: Guidelines to steer the proposer (e.g., "Don't overfit to specific examples")
+         run_dir: Directory to save state and trajectories
+         log_fn: Logging function (defaults to print)
+         save_trajectories: Whether to save trajectories to disk
+         seed: Random seed for reproducibility
+
+     Returns:
+         GEPAResult with optimization results
+     """
+     engine = GEPAEngine(
+         components=components,
+         evaluate_fn=evaluate_fn,
+         dataset=dataset,
+         task_client=task_client,
+         proposer_client=proposer_client,
+         val_dataset=val_dataset,
+         max_iterations=max_iterations,
+         max_evals=max_evals,
+         minibatch_size=minibatch_size,
+         perfect_score=perfect_score,
+         proposal_prompt_template=proposal_prompt_template,
+         meta_instructions=meta_instructions,
+         run_dir=run_dir,
+         log_fn=log_fn,
+         save_trajectories=save_trajectories,
+         seed=seed,
+     )
+
+     return asyncio.run(engine.run())
+
+
+ class GEPAEngine(Generic[T]):
+     """
+     Stateful GEPA optimizer.
+
+     Use this for more control over the optimization process:
+     - Resume from saved state
+     - Step through iterations manually
+     - Access intermediate state
+     """
+
+     def __init__(
+         self,
+         components: dict[str, Component],
+         evaluate_fn: AsyncEvaluateFn[T],
+         dataset: list[T],
+         task_client: _LLMClient,
+         proposer_client: _LLMClient,
+         *,
+         val_dataset: list[T] | None = None,
+         max_iterations: int = 100,
+         max_evals: int | None = None,
+         minibatch_size: int = 4,
+         perfect_score: float = 1.0,
+         proposal_prompt_template: str | None = None,
+         meta_instructions: str | None = None,
+         run_dir: str | Path | None = None,
+         log_fn: Callable[[str], None] | None = None,
+         save_trajectories: bool = False,
+         seed: int = 0,
+     ):
+         self.components = components
+         self.evaluate_fn = evaluate_fn
+         self.dataset = dataset
+         self.task_client = task_client
+         self.proposer_client = proposer_client
+         self.val_dataset = val_dataset if val_dataset is not None else dataset
+         self.max_iterations = max_iterations
+         self.max_evals = max_evals
+         self.minibatch_size = minibatch_size
+         self.perfect_score = perfect_score
+         self.proposal_prompt_template = proposal_prompt_template
+         self.meta_instructions = meta_instructions
+         self.run_dir = Path(run_dir) if run_dir else None
+         self.log_fn = log_fn or print
+         self.save_trajectories = save_trajectories
+
+         self.rng = random.Random(seed)
+         self.state: GEPAState | None = None
+         self._trajectory_counter = 0
+
+     def _log(self, msg: str) -> None:
+         self.log_fn(msg)
+
+     async def _evaluate_batch(
+         self,
+         examples: list[tuple[int, T]],  # (index, example) pairs
+         component_values: dict[str, str],
+     ) -> list[tuple[int, EvalResult]]:
+         """Evaluate a batch of examples concurrently, return (index, result) pairs."""
+
+         async def eval_one(idx: int, example: T) -> tuple[int, EvalResult] | None:
+             try:
+                 result = await self.evaluate_fn(
+                     self.task_client, component_values, example
+                 )
+                 return (idx, result)
+             except Exception as e:
+                 self._log(f"Error evaluating example {idx}: {e}")
+                 return None
+
+         # Run all evaluations concurrently
+         tasks = [eval_one(idx, example) for idx, example in examples]
+         results_raw = await asyncio.gather(*tasks)
+
+         # Filter out None results (failed evaluations)
+         results = [r for r in results_raw if r is not None]
+         return results
+
+     async def _evaluate_all(
+         self,
+         examples: list[T],
+         component_values: dict[str, str],
+     ) -> dict[int, float]:
+         """Evaluate all examples, return scores dict."""
+         indexed = [(i, ex) for i, ex in enumerate(examples)]
+         results = await self._evaluate_batch(indexed, component_values)
+
+         if self.state:
+             self.state.total_evals += len(results)
+
+         return {idx: result.score for idx, result in results}
+
+     def _save_trajectory(
+         self,
+         iteration: int,
+         tag: str,
+         candidate_values: dict[str, str],
+         example_idx: int,
+         result: EvalResult,
+     ) -> None:
+         """Save a trajectory to disk for debugging."""
+         if not self.save_trajectories or not self.run_dir:
+             return
+
+         traj_dir = self.run_dir / "trajectories"
+         traj_dir.mkdir(parents=True, exist_ok=True)
+
+         self._trajectory_counter += 1
+         filename = f"{self._trajectory_counter:04d}_iter{iteration}_{tag}.json"
+
+         data = {
+             "iteration": iteration,
+             "tag": tag,
+             "example_idx": example_idx,
+             "candidate": candidate_values,
+             "score": result.score,
+             "feedback": result.feedback,
+             "conversation": result.conversation.to_log(),
+         }
+
+         try:
+             with open(traj_dir / filename, "w") as f:
+                 json.dump(data, f, indent=2, default=str)
+         except Exception as e:
+             self._log(f"Failed to save trajectory: {e}")
+
+     async def initialize(self) -> None:
+         """Initialize state by evaluating seed candidate on validation set."""
+         self._log("Evaluating seed candidate...")
+
+         seed_values = {name: comp.value for name, comp in self.components.items()}
+         seed_scores = await self._evaluate_all(self.val_dataset, seed_values)
+
+         self.state = GEPAState.initialize(self.components, seed_scores)
+
+         avg_score = sum(seed_scores.values()) / len(seed_scores) if seed_scores else 0.0
+         self._log(
+             f"Seed candidate: avg_score={avg_score:.4f} on {len(seed_scores)} examples"
+         )
+
+     def _should_stop(self) -> bool:
+         """Check if we should stop optimization."""
+         if self.state is None:
+             return True
+
+         if self.state.iteration >= self.max_iterations:
+             return True
+
+         if self.max_evals and self.state.total_evals >= self.max_evals:
+             return True
+
+         # Stop if best candidate achieves perfect score on val set
+         best_idx = self.state.best_candidate_idx()
+         best_avg = self.state.get_candidate_avg_score(best_idx)
+         if best_avg >= self.perfect_score:
+             self._log(
+                 f"Best candidate achieved perfect score ({best_avg:.4f}), stopping"
+             )
+             return True
+
+         return False
+
+     async def step(self) -> bool:
+         """
+         Run one optimization iteration.
+
+         Returns:
+             True if optimization should continue, False if done
+         """
+         if self.state is None:
+             await self.initialize()
+
+         assert self.state is not None
+
+         if self._should_stop():
+             return False
+
+         self.state.iteration += 1
+         iteration = self.state.iteration
+
+         # Get current best candidate
+         best_idx = self.state.best_candidate_idx()
+         current_values = self.state.candidates[best_idx]
+
+         # Find examples where the BEST candidate isn't perfect
+         # (not Pareto front - we want to improve the best single candidate)
+         best_scores = self.state.candidate_scores[best_idx]
+         improvable = [
+             ex_idx
+             for ex_idx, score in best_scores.items()
+             if score < self.perfect_score
+         ]
+
+         if not improvable:
+             # Best candidate is perfect on all examples it was evaluated on
+             # Just pick a random example to re-evaluate and potentially find issues
+             improvable = list(best_scores.keys())
+             if not improvable:
+                 self._log(f"Iteration {iteration}: No examples to evaluate")
+                 return False
+
+         # Pick an example to focus on (prefer non-perfect ones)
+         focus_idx = self.rng.choice(improvable)
+
+         # Evaluate current candidate on focus example to get trajectory
+         focus_example = self.val_dataset[focus_idx]
+         results = await self._evaluate_batch(
+             [(focus_idx, focus_example)], current_values
+         )
+
+         if not results:
+             self._log(f"Iteration {iteration}: Failed to evaluate focus example")
+             return True
+
+         _, focus_result = results[0]
+         self.state.total_evals += 1
+
+         if self.save_trajectories:
+             self._save_trajectory(
+                 iteration, "focus", current_values, focus_idx, focus_result
+             )
+
+         # Generate proposal
+         proposal = await propose_improvement(
+             proposer_client=self.proposer_client,
+             eval_result=focus_result,
+             components=self.components,
+             current_values=current_values,
+             prompt_template=self.proposal_prompt_template,
+             meta_instructions=self.meta_instructions,
+         )
+
+         if proposal is None:
+             self._log(f"Iteration {iteration}: No proposal generated")
+             return True
+
+         self._log(
+             f"Iteration {iteration}: Proposing change to '{proposal.component_name}' - {proposal.reasoning[:80]}..."
+         )
+
+         # Build new candidate
+         new_values = dict(current_values)
+         new_values[proposal.component_name] = proposal.new_value
+
+         # Evaluate on minibatch (including focus example)
+         minibatch_indices = [focus_idx]
+         other_indices = [i for i in improvable if i != focus_idx]
+         if other_indices:
+             additional = self.rng.sample(
+                 other_indices, min(self.minibatch_size - 1, len(other_indices))
+             )
+             minibatch_indices.extend(additional)
+
+         # Evaluate old and new candidates on minibatch concurrently
+         minibatch = [(i, self.val_dataset[i]) for i in minibatch_indices]
+         old_results, new_results = await asyncio.gather(
+             self._evaluate_batch(minibatch, current_values),
+             self._evaluate_batch(minibatch, new_values),
+         )
+
+         old_sum = sum(r.score for _, r in old_results)
+         new_sum = sum(r.score for _, r in new_results)
+
+         self.state.total_evals += len(old_results) + len(new_results)
+
+         # Accept if improved
+         if new_sum <= old_sum:
+             self._log(
+                 f"Iteration {iteration}: Rejected (old={old_sum:.3f}, new={new_sum:.3f})"
+             )
+             return True
+
+         self._log(
+             f"Iteration {iteration}: Accepted (old={old_sum:.3f}, new={new_sum:.3f})"
+         )
+
+         # Full validation evaluation
+         val_scores = await self._evaluate_all(self.val_dataset, new_values)
+
+         # Add to population
+         new_idx = self.state.add_candidate(new_values, best_idx, val_scores)
+
+         new_avg = sum(val_scores.values()) / len(val_scores) if val_scores else 0.0
+         best_avg = self.state.get_candidate_avg_score(self.state.best_candidate_idx())
+         self._log(
+             f" New candidate {new_idx}: val_avg={new_avg:.4f}, best={best_avg:.4f}, "
+             f"pool={len(self.state.candidates)}"
+         )
+
+         # Save state periodically
+         if self.run_dir and iteration % 10 == 0:
+             self.state.save(self.run_dir)
+
+         return True
+
+     async def run(self) -> GEPAResult:
+         """Run optimization until stopping condition."""
+         if self.state is None:
+             await self.initialize()
+
+         try:
+             from tqdm import tqdm
+
+             pbar = tqdm(
+                 total=self.max_iterations,
+                 desc="GEPA",
+                 unit="iter",
+             )
+         except ImportError:
+             pbar = None
+
+         while await self.step():
+             if pbar:
+                 pbar.update(1)
+                 pbar.set_postfix(
+                     evals=self.state.total_evals if self.state else 0,
+                     best=f"{self.state.get_candidate_avg_score(self.state.best_candidate_idx()):.3f}"
+                     if self.state
+                     else 0,
+                 )
+
+         if pbar:
+             pbar.close()
+
+         # Save final state
+         if self.run_dir and self.state:
+             self.state.save(self.run_dir)
+
+         return self.result()
+
+     def result(self) -> GEPAResult:
+         """Get current result as immutable snapshot."""
+         if self.state is None:
+             raise RuntimeError(
+                 "Optimizer not initialized. Call initialize() or run() first."
+             )
+
+         return GEPAResult.from_state(
+             self.state, run_dir=str(self.run_dir) if self.run_dir else None
+         )
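
Usage sketch (not part of the diff). The following is a minimal illustration of how the new optimize() entry point might be called, inferred from the signatures above; the public LLMClient class, the re-exports from lm_deluge.pipelines.gepa, the Component/EvalResult constructor keywords, and the model names are assumptions rather than details confirmed by this release.

# Hypothetical example; anything marked "assumed" is not guaranteed by this diff.
from lm_deluge import LLMClient  # assumed public client wrapper around _LLMClient
from lm_deluge.pipelines.gepa import Component, EvalResult, optimize  # assumed re-exports

components = {
    "system_prompt": Component(
        value="You are a helpful assistant.",
        description="System prompt given to the task model.",  # Component.description is read by the proposer
    ),
}

async def evaluate(client, values, example) -> EvalResult:
    # Run the task with the candidate system prompt, then score the completion.
    # resp is assumed to expose .completion and to be usable as the EvalResult trajectory.
    resp = await client.start(f"{values['system_prompt']}\n\n{example['question']}")
    score = 1.0 if example["answer"] in (resp.completion or "") else 0.0
    feedback = "correct" if score == 1.0 else f"expected {example['answer']!r} in the answer"
    return EvalResult(score=score, feedback=feedback, conversation=resp)

result = optimize(
    components=components,
    evaluate_fn=evaluate,
    dataset=[{"question": "What is 2 + 2?", "answer": "4"}],
    task_client=LLMClient("gpt-4o-mini"),  # placeholder model names
    proposer_client=LLMClient("gpt-4o"),
    max_iterations=10,
    minibatch_size=2,
)
print(result)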
lm_deluge/pipelines/gepa/proposer.py
@@ -0,0 +1,235 @@
+ """
+ Proposer for GEPA optimization.
+
+ The proposer analyzes a trajectory and proposes improvements to ONE component.
+ """
+
+ from __future__ import annotations
+
+ from lm_deluge.prompt import Conversation
+
+ from lm_deluge.pipelines.gepa.core import Component, EvalResult, Proposal
+ from lm_deluge.pipelines.gepa.util import (
+     extract_text_from_response,
+     format_components_for_prompt,
+     format_conversation_compact,
+ )
+
+
+ DEFAULT_PROPOSAL_PROMPT = """You are optimizing an AI system by improving its text configuration.
+
+ ## The Trajectory
+
+ Below is a conversation showing what the AI did on a task:
+
+ <trajectory>
+ {trajectory}
+ </trajectory>
+
+ ## Feedback
+
+ {feedback}
+
+ ## Components
+
+ These are the text components that control the AI's behavior. You can modify ONE of them:
+
+ {components}
+ {meta_instructions}
+ ## Your Task
+
+ 1. Analyze the trajectory to understand what went wrong (or could be better)
+ 2. Identify which component is most likely responsible
+ 3. Propose a specific improvement to that ONE component
+
+ Think about:
+ - Did the AI misunderstand the task? (maybe the system prompt needs clarity)
+ - Did it use tools incorrectly? (maybe tool descriptions need improvement)
+ - Did it miss important information? (maybe instructions need to be more explicit)
+
+ ## Response Format
+
+ Respond with:
+ COMPONENT: <name of the component to change>
+ REASONING: <1-2 sentences on why this change will help>
+ NEW_VALUE:
+ ```
+ <the complete improved text for this component>
+ ```
+ """
+
+
+ def build_proposal_prompt(
+     conversation: Conversation,
+     feedback: str,
+     components: dict[str, Component],
+     current_values: dict[str, str],
+     template: str | None = None,
+     meta_instructions: str | None = None,
+ ) -> str:
+     """
+     Build the prompt for the proposer LLM.
+
+     Args:
+         conversation: The trajectory to analyze
+         feedback: Feedback on the result
+         components: Component definitions (with descriptions)
+         current_values: Current text values for each component
+         template: Optional custom prompt template
+         meta_instructions: Optional instructions to guide the proposer's behavior
+             (e.g., "Focus on general improvements, don't overfit to specific examples")
+
+     Returns:
+         Formatted prompt string
+     """
+     template = template or DEFAULT_PROPOSAL_PROMPT
+
+     # Format trajectory
+     trajectory_str = format_conversation_compact(conversation)
+
+     # Format components
+     descriptions = {name: comp.description for name, comp in components.items()}
+     components_str = format_components_for_prompt(current_values, descriptions)
+
+     # Format meta instructions
+     if meta_instructions:
+         meta_str = f"\n## Guidelines\n\n{meta_instructions}\n\n"
+     else:
+         meta_str = "\n"
+
+     return template.format(
+         trajectory=trajectory_str,
+         feedback=feedback,
+         components=components_str,
+         meta_instructions=meta_str,
+     )
+
+
+ def parse_proposal_response(
+     response: str, valid_components: list[str]
+ ) -> Proposal | None:
+     """
+     Parse the proposer's response to extract the proposal.
+
+     Args:
+         response: Raw LLM response
+         valid_components: List of valid component names
+
+     Returns:
+         Proposal if parsing succeeded, None otherwise
+     """
+     # Find COMPONENT line
+     component_name = None
+     for line in response.split("\n"):
+         line = line.strip()
+         if line.upper().startswith("COMPONENT:"):
+             component_name = line.split(":", 1)[1].strip()
+             break
+
+     if not component_name:
+         return None
+
+     # Validate component name
+     if component_name not in valid_components:
+         # Try case-insensitive match
+         for valid in valid_components:
+             if valid.lower() == component_name.lower():
+                 component_name = valid
+                 break
+         else:
+             return None
+
+     # Find REASONING line
+     reasoning = ""
+     for line in response.split("\n"):
+         line = line.strip()
+         if line.upper().startswith("REASONING:"):
+             reasoning = line.split(":", 1)[1].strip()
+             break
+
+     # Extract new value from code block
+     new_value = extract_text_from_response(response)
+     if not new_value:
+         return None
+
+     return Proposal(
+         component_name=component_name,
+         new_value=new_value,
+         reasoning=reasoning,
+     )
+
+
+ async def propose_improvement(
+     proposer_client,  # LLMClient
+     eval_result: EvalResult,
+     components: dict[str, Component],
+     current_values: dict[str, str],
+     prompt_template: str | None = None,
+     meta_instructions: str | None = None,
+ ) -> Proposal | None:
+     """
+     Use an LLM to propose an improvement to one component.
+
+     Args:
+         proposer_client: LLMClient for generating proposals
+         eval_result: The evaluation result containing trajectory and feedback
+         components: Component definitions
+         current_values: Current text values
+         prompt_template: Optional custom prompt template
+         meta_instructions: Optional guidelines to steer the proposer
+             (e.g., "Don't overfit to specific examples")
+
+     Returns:
+         Proposal if successful, None otherwise
+     """
+     # Build prompt
+     prompt = build_proposal_prompt(
+         conversation=eval_result.conversation,
+         feedback=eval_result.feedback,
+         components=components,
+         current_values=current_values,
+         template=prompt_template,
+         meta_instructions=meta_instructions,
+     )
+
+     # Call LLM
+     response = await proposer_client.start(prompt)
+     if not response or not response.completion:
+         return None
+
+     response_text = response.completion
+
+     # Parse response
+     valid_components = list(components.keys())
+     return parse_proposal_response(response_text, valid_components)
+
+
+ def propose_improvement_sync(
+     proposer_client,  # LLMClient
+     eval_result: EvalResult,
+     components: dict[str, Component],
+     current_values: dict[str, str],
+     prompt_template: str | None = None,
+ ) -> Proposal | None:
+     """
+     Synchronous version of propose_improvement.
+     """
+     # Build prompt
+     prompt = build_proposal_prompt(
+         conversation=eval_result.conversation,
+         feedback=eval_result.feedback,
+         components=components,
+         current_values=current_values,
+         template=prompt_template,
+     )
+
+     # Call LLM
+     responses = proposer_client.process_prompts_sync([prompt], show_progress=False)
+     if not responses or not responses[0].completion:
+         return None
+
+     response_text = responses[0].completion
+
+     # Parse response
+     valid_components = list(components.keys())
+     return parse_proposal_response(response_text, valid_components)
+ return parse_proposal_response(response_text, valid_components)