deepeval 3.7.3__py3-none-any.whl → 3.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/evaluate/configs.py +1 -1
  5. deepeval/evaluate/execute.py +4 -1
  6. deepeval/metrics/answer_relevancy/template.py +4 -4
  7. deepeval/metrics/argument_correctness/template.py +2 -2
  8. deepeval/metrics/bias/template.py +3 -3
  9. deepeval/metrics/contextual_precision/template.py +6 -6
  10. deepeval/metrics/contextual_recall/template.py +2 -2
  11. deepeval/metrics/contextual_relevancy/template.py +3 -3
  12. deepeval/metrics/conversation_completeness/template.py +2 -2
  13. deepeval/metrics/conversational_dag/templates.py +4 -4
  14. deepeval/metrics/conversational_g_eval/template.py +4 -3
  15. deepeval/metrics/dag/templates.py +4 -4
  16. deepeval/metrics/faithfulness/template.py +4 -4
  17. deepeval/metrics/hallucination/template.py +4 -4
  18. deepeval/metrics/misuse/template.py +2 -2
  19. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +7 -7
  20. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +6 -6
  21. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +2 -2
  22. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +3 -3
  23. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +9 -9
  24. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +4 -4
  25. deepeval/metrics/non_advice/template.py +2 -2
  26. deepeval/metrics/pii_leakage/template.py +2 -2
  27. deepeval/metrics/prompt_alignment/template.py +4 -4
  28. deepeval/metrics/role_violation/template.py +2 -2
  29. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  30. deepeval/metrics/toxicity/template.py +4 -4
  31. deepeval/metrics/turn_relevancy/template.py +2 -2
  32. deepeval/models/embedding_models/azure_embedding_model.py +28 -15
  33. deepeval/models/embedding_models/local_embedding_model.py +23 -10
  34. deepeval/models/embedding_models/ollama_embedding_model.py +8 -6
  35. deepeval/models/embedding_models/openai_embedding_model.py +18 -2
  36. deepeval/models/llms/anthropic_model.py +17 -5
  37. deepeval/models/llms/azure_model.py +30 -18
  38. deepeval/models/llms/deepseek_model.py +22 -12
  39. deepeval/models/llms/gemini_model.py +120 -87
  40. deepeval/models/llms/grok_model.py +23 -16
  41. deepeval/models/llms/kimi_model.py +23 -12
  42. deepeval/models/llms/litellm_model.py +63 -25
  43. deepeval/models/llms/local_model.py +26 -18
  44. deepeval/models/llms/ollama_model.py +17 -7
  45. deepeval/models/llms/openai_model.py +22 -17
  46. deepeval/models/llms/portkey_model.py +132 -0
  47. deepeval/models/mlllms/azure_model.py +28 -19
  48. deepeval/models/mlllms/gemini_model.py +102 -73
  49. deepeval/models/mlllms/ollama_model.py +40 -9
  50. deepeval/models/mlllms/openai_model.py +65 -14
  51. deepeval/models/utils.py +48 -3
  52. deepeval/optimization/__init__.py +13 -0
  53. deepeval/optimization/adapters/__init__.py +2 -0
  54. deepeval/optimization/adapters/deepeval_scoring_adapter.py +588 -0
  55. deepeval/optimization/aggregates.py +14 -0
  56. deepeval/optimization/configs.py +34 -0
  57. deepeval/optimization/copro/configs.py +31 -0
  58. deepeval/optimization/copro/loop.py +837 -0
  59. deepeval/optimization/gepa/__init__.py +7 -0
  60. deepeval/optimization/gepa/configs.py +115 -0
  61. deepeval/optimization/gepa/loop.py +677 -0
  62. deepeval/optimization/miprov2/configs.py +134 -0
  63. deepeval/optimization/miprov2/loop.py +785 -0
  64. deepeval/optimization/mutations/__init__.py +0 -0
  65. deepeval/optimization/mutations/prompt_rewriter.py +458 -0
  66. deepeval/optimization/policies/__init__.py +16 -0
  67. deepeval/optimization/policies/selection.py +166 -0
  68. deepeval/optimization/policies/tie_breaker.py +67 -0
  69. deepeval/optimization/prompt_optimizer.py +462 -0
  70. deepeval/optimization/simba/__init__.py +0 -0
  71. deepeval/optimization/simba/configs.py +33 -0
  72. deepeval/optimization/simba/loop.py +983 -0
  73. deepeval/optimization/simba/types.py +15 -0
  74. deepeval/optimization/types.py +361 -0
  75. deepeval/optimization/utils.py +598 -0
  76. deepeval/prompt/prompt.py +10 -5
  77. deepeval/test_run/cache.py +2 -0
  78. deepeval/test_run/test_run.py +6 -1
  79. deepeval/utils.py +24 -0
  80. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/METADATA +1 -1
  81. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/RECORD +84 -59
  82. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/LICENSE.md +0 -0
  83. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/WHEEL +0 -0
  84. {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/entry_points.txt +0 -0
--- /dev/null
+++ b/deepeval/optimization/gepa/loop.py
@@ -0,0 +1,677 @@
+from __future__ import annotations
+import uuid
+import random
+import time
+
+from typing import (
+    Awaitable,
+    Callable,
+    Dict,
+    List,
+    Tuple,
+    TYPE_CHECKING,
+    Union,
+    Optional,
+)
+
+from deepeval.errors import DeepEvalError
+from deepeval.optimization.aggregates import Aggregator, mean_of_all
+from deepeval.optimization.types import (
+    AcceptedIterationDict,
+    PromptConfiguration,
+    PromptConfigurationId,
+    ModuleId,
+    ScoreTable,
+    ScoringAdapter,
+    OptimizationResult,
+    RunnerStatusType,
+    RunnerStatusCallbackProtocol,
+)
+from deepeval.optimization.utils import (
+    split_goldens,
+    build_prompt_config_snapshots,
+)
+from deepeval.optimization.policies import (
+    pick_best_with_ties,
+    select_prompt_configuration_pareto,
+)
+from deepeval.prompt.api import PromptType
+from deepeval.prompt.prompt import Prompt
+from deepeval.optimization.mutations.prompt_rewriter import (
+    PromptRewriter,
+)
+from .configs import GEPAConfig
+
+
+if TYPE_CHECKING:
+    from deepeval.dataset.golden import Golden, ConversationalGolden
+
+
+class GEPARunner:
+    """
+    GEPA loop with sync/async execution.
+
+    This runner is intentionally low level and does not know about metrics,
+    models, or async configs. It relies on a preconfigured
+    ScoringAdapter and PromptRewriter, which are typically constructed by
+    the higher-level PromptOptimizer.
+    """
+
+    SINGLE_MODULE_ID: ModuleId = "__module__"
+
+    def __init__(
+        self,
+        *,
+        config: GEPAConfig,
+        aggregate_instances: Aggregator = mean_of_all,
+        scoring_adapter: Optional[ScoringAdapter] = None,
+    ) -> None:
+        self.config = config
+        self.aggregate_instances = aggregate_instances
+        self.scoring_adapter = scoring_adapter
+
+        # random seeded from config is used for splits, sampling, and tie-breaking.
+        self.random_state = random.Random(config.random_seed)
+
+        # runtime state to be reset between runs
+        self.reset_state()
+
+        # Status callback set by PromptOptimizer:
+        # (kind, step_index, total_steps, detail) -> None
+        self.status_callback: Optional[RunnerStatusCallbackProtocol] = None
+
+        # Model callback used by the rewriter set by PromptOptimizer.
+        self.model_callback: Optional[
+            Callable[
+                ...,
+                Union[
+                    str,
+                    Dict,
+                    Tuple[Union[str, Dict], float],
+                ],
+            ]
+        ] = None
+
+        # lazy loaded
+        self._rewriter: Optional[PromptRewriter] = None
+
+    ##############
+    # Public API #
+    ##############
+
+    def execute(
+        self,
+        *,
+        prompt: Prompt,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> Tuple[Prompt, Dict]:
+        """Synchronous GEPA run from a full list of goldens (splits internally)."""
+        total_goldens = len(goldens)
+        if total_goldens < 2:
+            raise DeepEvalError(
+                "GEPA prompt optimization requires at least 2 goldens, but "
+                f"received {total_goldens}. Provide at least two goldens to "
+                "run the optimizer."
+            )
+
+        self._ensure_scoring_adapter()
+        self._ensure_rewriter()
+        self.reset_state()
+
+        d_feedback, d_pareto = split_goldens(
+            goldens, self.config.pareto_size, random_state=self.random_state
+        )
+
+        seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
+        root_prompt_configuration = PromptConfiguration.new(
+            prompts=dict(seed_prompts_by_module)
+        )
+        self._add_prompt_configuration(root_prompt_configuration)
+
+        accepted_iterations: List[Dict] = []
+
+        def _one_iteration() -> bool:
+            nonlocal accepted_iterations
+
+            if not d_feedback:
+                return False
+
+            # Seed Pareto scores lazily on first iteration
+            if not self.pareto_score_table:
+                self.pareto_score_table[root_prompt_configuration.id] = (
+                    self.scoring_adapter.score_on_pareto(
+                        root_prompt_configuration, d_pareto
+                    )
+                )
+
+            # 1. Pick prompt_configuration via Pareto
+            parent_prompt_configuration = self._pick_prompt_configuration()
+
+            # 2. Single module id
+            selected_module_id: ModuleId = self.SINGLE_MODULE_ID
+
+            # 3. Draw minibatch
+            minibatch = self._draw_minibatch(d_feedback)
+
+            # 4. Feedback
+            feedback_text = self.scoring_adapter.minibatch_feedback(
+                parent_prompt_configuration, selected_module_id, minibatch
+            )
+
+            # 5. Rewrite
+            child_prompt = self._generate_child_prompt(
+                selected_module_id, parent_prompt_configuration, feedback_text
+            )
+            if child_prompt is None:
+                # Child prompt matched parent; skip this iteration.
+                return True
+
+            # 6. Child prompt_configuration
+            child_prompt_configuration = self._make_child(
+                selected_module_id, parent_prompt_configuration, child_prompt
+            )
+
+            # 7. Evaluate parent/child on minibatch
+            parent_score = self.scoring_adapter.minibatch_score(
+                parent_prompt_configuration, minibatch
+            )
+            child_score = self.scoring_adapter.minibatch_score(
+                child_prompt_configuration, minibatch
+            )
+
+            # 8. Acceptance test
+            if self._should_accept_child(parent_score, child_score):
+                accepted_iterations.append(
+                    self._accept_child(
+                        selected_module_id,
+                        parent_prompt_configuration,
+                        child_prompt_configuration,
+                        d_pareto,
+                        parent_score,
+                        child_score,
+                    )
+                )
+
+            return True
+
+        self._run_loop_iteration(_one_iteration)
+        best = self._best_by_aggregate()
+        prompt_config_snapshots = build_prompt_config_snapshots(
+            self.prompt_configurations_by_id
+        )
+        report = OptimizationResult(
+            optimization_id=self.optimization_id,
+            best_id=best.id,
+            accepted_iterations=accepted_iterations,
+            pareto_scores=self.pareto_score_table,
+            parents=self.parents_by_id,
+            prompt_configurations=prompt_config_snapshots,
+        )
+        return best.prompts[self.SINGLE_MODULE_ID], report.as_dict()
+
+    async def a_execute(
+        self,
+        *,
+        prompt: Prompt,
+        goldens: Union[List["Golden"], List["ConversationalGolden"]],
+    ) -> Tuple[Prompt, Dict]:
+ """Asynchronous twin of execute_gepa()."""
+        total_goldens = len(goldens)
+        if total_goldens < 2:
+            raise DeepEvalError(
+                "GEPA prompt optimization requires at least 2 goldens, but "
+                f"received {total_goldens}. Provide at least two goldens to "
+                "run the optimizer."
+            )
+
+        self._ensure_scoring_adapter()
+        self._ensure_rewriter()
+        self.reset_state()
+
+        d_feedback, d_pareto = split_goldens(
+            goldens, self.config.pareto_size, random_state=self.random_state
+        )
+
+        seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
+        root_prompt_configuration = PromptConfiguration.new(
+            prompts=dict(seed_prompts_by_module)
+        )
+        self._add_prompt_configuration(root_prompt_configuration)
+
+        accepted_iterations: List[Dict] = []
+
+        async def _one_iteration() -> bool:
+            nonlocal accepted_iterations
+
+            if not d_feedback:
+                return False
+
+            # Seed Pareto scores lazily on first iteration
+            if not self.pareto_score_table:
+                self.pareto_score_table[root_prompt_configuration.id] = (
+                    await self.scoring_adapter.a_score_on_pareto(
+                        root_prompt_configuration, d_pareto
+                    )
+                )
+
+            # 1. Pick prompt_configuration via Pareto
+            parent_prompt_configuration = self._pick_prompt_configuration()
+
+            # 2. Single module id
+            selected_module_id: ModuleId = self.SINGLE_MODULE_ID
+
+            # 3. Draw minibatch
+            minibatch = self._draw_minibatch(d_feedback)
+
+            # 4. Feedback
+            feedback_text = await self.scoring_adapter.a_minibatch_feedback(
+                parent_prompt_configuration, selected_module_id, minibatch
+            )
+
+            # 5. Rewrite
+            child_prompt = await self._a_generate_child_prompt(
+                selected_module_id, parent_prompt_configuration, feedback_text
+            )
+            if child_prompt is None:
+                # Child prompt matched parent; skip this iteration.
+                return True
+
+            # 6. Child prompt_configuration
+            child_prompt_configuration = self._make_child(
+                selected_module_id, parent_prompt_configuration, child_prompt
+            )
+
+            # 7. Evaluate parent/child on minibatch
+            parent_score = await self.scoring_adapter.a_minibatch_score(
+                parent_prompt_configuration, minibatch
+            )
+            child_score = await self.scoring_adapter.a_minibatch_score(
+                child_prompt_configuration, minibatch
+            )
+
+            # 8. Acceptance test
+            if self._should_accept_child(parent_score, child_score):
+                accepted_iterations.append(
+                    await self._a_accept_child(
+                        selected_module_id,
+                        parent_prompt_configuration,
+                        child_prompt_configuration,
+                        d_pareto,
+                        parent_score,
+                        child_score,
+                    )
+                )
+            return True
+
+        await self._a_run_loop_iteration(_one_iteration)
+        best = self._best_by_aggregate()
+        prompt_config_snapshots = build_prompt_config_snapshots(
+            self.prompt_configurations_by_id
+        )
+        report = OptimizationResult(
+            optimization_id=self.optimization_id,
+            best_id=best.id,
+            accepted_iterations=accepted_iterations,
+            pareto_scores=self.pareto_score_table,
+            parents=self.parents_by_id,
+            prompt_configurations=prompt_config_snapshots,
+        )
+        return best.prompts[self.SINGLE_MODULE_ID], report.as_dict()
+
+    ###################
+    # State & helpers #
+    ###################
+
+    def reset_state(self) -> None:
+        self.optimization_id = str(uuid.uuid4())
+        self.prompt_configurations_by_id: Dict[
+            PromptConfigurationId, PromptConfiguration
+        ] = {}
+        self.parents_by_id: Dict[
+            PromptConfigurationId, Optional[PromptConfigurationId]
+        ] = {}
+        self.pareto_score_table: ScoreTable = {}
+
+    def _ensure_scoring_adapter(self) -> None:
+        if self.scoring_adapter is None:
+            raise DeepEvalError(
+                "GEPARunner requires a `scoring_adapter`. "
+                "Construct one (for example, DeepEvalScoringAdapter) in "
+                "PromptOptimizer and assign it to `runner.scoring_adapter`."
+            )
+
+    def _ensure_rewriter(self) -> None:
+        if self._rewriter is not None:
+            return
+
+        # For now, always use the basic PromptRewriter. Additional
+        # variants (e.g. for GEPA Alg. 4 crossover) can be introduced
+        # later
+        self._rewriter = PromptRewriter()
+
+    def _prompts_equivalent(
+        self, old_prompt: Prompt, new_prompt: Prompt
+    ) -> bool:
+        """
+        Compare two Prompts for GEPA acceptance purposes.
+
+        This is used as:
+            if self._prompts_equivalent(old, new):
+                # reject child (treat as "no change")
+                return None
+
+        So:
+        - Return True: "do not accept this child"
+        - Return False: "child is meaningfully different"
+
+        Rules:
+        - The types must be the same for this check to be meaningful
+        - For TEXT: compare text_template with whitespace trimmed
+        - For LIST: compare messages_template (length, role, and content,
+          with content whitespace trimmed).
+        """
+
+        # LIST prompts: compare messages
+        if new_prompt.type == PromptType.LIST:
+            old_msgs = old_prompt.messages_template
+            new_msgs = new_prompt.messages_template
+            if len(old_msgs) != len(new_msgs):
+                return False
+
+            for old_msg, new_msg in zip(old_msgs, new_msgs):
+                if old_msg.role != new_msg.role:
+                    return False
+                if (old_msg.content or "").strip() != (
+                    new_msg.content or ""
+                ).strip():
+                    return False
+
+            return True
+
+        # TEXT prompts: compare text_template
+        old_txt = (old_prompt.text_template or "").strip()
+        new_txt = (new_prompt.text_template or "").strip()
+        return new_txt == old_txt
+
+    def _add_prompt_configuration(
+        self, prompt_configuration: PromptConfiguration
+    ) -> None:
+        self.prompt_configurations_by_id[prompt_configuration.id] = (
+            prompt_configuration
+        )
+        self.parents_by_id[prompt_configuration.id] = (
+            prompt_configuration.parent
+        )
+
+    def _best_by_aggregate(self) -> PromptConfiguration:
+        totals = {
+            prompt_configuration_id: self.aggregate_instances(vector)
+            for prompt_configuration_id, vector in self.pareto_score_table.items()
+        }
+
+        chosen, tied, max_val = pick_best_with_ties(
+            totals,
+            self.parents_by_id,
+            random_state=self.random_state,
+            tie_tolerance=float(self.config.tie_tolerance),
+            policy=self.config.tie_breaker,
+        )
+        if self.status_callback is not None and len(tied) > 1:
+            msg = (
+                f"tie on aggregate={max_val:.4f} among {len(tied)} "
+                f"prompt_configurations; using tie_breaker="
+                f"{self.config.tie_breaker.value!r} selected {chosen}. "
+                f"To change, set GEPAConfig.tie_breaker to one of: "
+                f"{[t.value for t in self.config.TieBreaker]} "
+                f"(tie_tolerance={float(self.config.tie_tolerance):g})."
+            )
+            self.status_callback(
+                RunnerStatusType.TIE,
+                detail=msg,
+            )
+
+        return self.prompt_configurations_by_id[chosen]
+
+    def _pick_prompt_configuration(self) -> PromptConfiguration:
+        selected_prompt_configuration_id = select_prompt_configuration_pareto(
+            self.pareto_score_table, random_state=self.random_state
+        )
+        return self.prompt_configurations_by_id[
+            selected_prompt_configuration_id
+        ]
+
+    def _draw_minibatch(
+        self, d_feedback: Union[List["Golden"], List["ConversationalGolden"]]
+    ) -> Union[List["Golden"], List["ConversationalGolden"]]:
+        # Determine effective minibatch size from GEPAConfig, bounded by the
+        # available feedback set.
+        n_feedback = len(d_feedback)
+        if n_feedback <= 0:
+            return []
+
+        if self.config.minibatch_size is not None:
+            size = self.config.minibatch_size
+        else:
+            # Dynamic sizing from ratio, bounded between min and max.
+            dynamic = max(
+                1, int(round(n_feedback * self.config.minibatch_ratio))
+            )
+            size = max(
+                self.config.minibatch_min_size,
+                min(dynamic, self.config.minibatch_max_size),
+            )
+
+        size = max(1, min(size, n_feedback))
+
+        return [
+            d_feedback[self.random_state.randrange(0, n_feedback)]
+            for _ in range(size)
+        ]
+
+    async def _a_generate_child_prompt(
+        self,
+        selected_module_id: ModuleId,
+        parent_prompt_configuration: PromptConfiguration,
+        feedback_text: str,
+    ) -> Optional[Prompt]:
+        old_prompt = parent_prompt_configuration.prompts.get(
+            selected_module_id, Prompt(text_template="")
+        )
+
+        new_prompt = await self._rewriter.a_rewrite(
+            model_callback=self.model_callback,
+            module_id=selected_module_id,
+            old_prompt=old_prompt,
+            feedback_text=feedback_text,
+        )
+
+        if old_prompt.type != new_prompt.type or self._prompts_equivalent(
+            old_prompt, new_prompt
+        ):
+            # don't accept if new prompt is the same as parent
+            # or if the type somehow changed
+            return None
+        return new_prompt
+
+    def _generate_child_prompt(
+        self,
+        selected_module_id: ModuleId,
+        parent_prompt_configuration: PromptConfiguration,
+        feedback_text: str,
+    ) -> Optional[Prompt]:
+        old_prompt = parent_prompt_configuration.prompts.get(
+            selected_module_id, Prompt(text_template="")
+        )
+
+        new_prompt = self._rewriter.rewrite(
+            model_callback=self.model_callback,
+            module_id=selected_module_id,
+            old_prompt=old_prompt,
+            feedback_text=feedback_text,
+        )
+
+        if old_prompt.type != new_prompt.type or self._prompts_equivalent(
+            old_prompt, new_prompt
+        ):
+            # don't accept if new prompt is the same as parent
+            # or if the type somehow changed
+            return None
+        return new_prompt
+
+    def _make_child(
+        self,
+        selected_module_id: ModuleId,
+        parent_prompt_configuration: PromptConfiguration,
+        child_prompt: Prompt,
+    ) -> PromptConfiguration:
+        child_prompt_configuration = PromptConfiguration.new(
+            prompts=dict(parent_prompt_configuration.prompts),
+            parent=parent_prompt_configuration.id,
+        )
+        child_prompt_configuration.prompts[selected_module_id] = child_prompt
+        return child_prompt_configuration
+
+    def _should_accept_child(
+        self, parent_score: float, child_score: float
+    ) -> bool:
+        jitter = 1e-6
+        return child_score >= parent_score + max(self.config.min_delta, jitter)
+
+    def _accept_child(
+        self,
+        selected_module_id: ModuleId,
+        parent_prompt_configuration: PromptConfiguration,
+        child_prompt_configuration: PromptConfiguration,
+        d_pareto: Union[List["Golden"], List["ConversationalGolden"]],
+        parent_score: float,
+        child_score: float,
+    ) -> AcceptedIterationDict:
+        self._add_prompt_configuration(child_prompt_configuration)
+        self.pareto_score_table[child_prompt_configuration.id] = (
+            self.scoring_adapter.score_on_pareto(
+                child_prompt_configuration, d_pareto
+            )
+        )
+
+        return AcceptedIterationDict(
+            parent=parent_prompt_configuration.id,
+            child=child_prompt_configuration.id,
+            module=selected_module_id,
+            before=parent_score,
+            after=child_score,
+        )
+
+    async def _a_accept_child(
+        self,
+        selected_module_id: ModuleId,
+        parent_prompt_configuration: PromptConfiguration,
+        child_prompt_configuration: PromptConfiguration,
+        d_pareto: Union[List["Golden"], List["ConversationalGolden"]],
+        parent_score: float,
+        child_score: float,
+    ) -> AcceptedIterationDict:
+        self._add_prompt_configuration(child_prompt_configuration)
+        self.pareto_score_table[child_prompt_configuration.id] = (
+            await self.scoring_adapter.a_score_on_pareto(
+                child_prompt_configuration, d_pareto
+            )
+        )
+
+        return AcceptedIterationDict(
+            parent=parent_prompt_configuration.id,
+            child=child_prompt_configuration.id,
+            module=selected_module_id,
+            before=parent_score,
+            after=child_score,
+        )
+
+    def _update_progress(
+        self,
+        total_iterations: int,
+        iteration: int,
+        remaining_iterations: int,
+        elapsed: float,
+    ):
+        if self.status_callback is not None:
+            detail = (
+                f"(iterations={total_iterations}) "
+                f"• iteration {iteration}/{total_iterations} "
+                f"• {elapsed:.2f}s • remaining={remaining_iterations}"
+            )
+            self.status_callback(
+                RunnerStatusType.PROGRESS,
+                step_index=iteration,
+                total_steps=total_iterations,
+                detail=detail,
+            )
+
+    def _update_error(
+        self, total_iterations: int, iteration: int, exc: Exception
+    ):
+        # Report a user facing error event
+        if self.status_callback is not None:
+            detail = (
+                f"(iterations={total_iterations}) "
+                f"• error {exc.__class__.__name__}: {exc} "
+                f"• halted at iteration {iteration}"
+            )
+            self.status_callback(
+                RunnerStatusType.ERROR,
+                step_index=iteration,
+                total_steps=total_iterations,
+                detail=detail,
+            )
+
+    def _run_loop_iteration(
+        self,
+        gepa_iteration: Callable[[], bool],
+    ) -> None:
+        total_iterations = self.config.iterations
+        remaining_iterations = total_iterations
+        iteration = 0
+        self._update_progress(
+            total_iterations, iteration, remaining_iterations, 0
+        )
+        while remaining_iterations > 0:
+            iteration += 1
+            start_time = time.perf_counter()
+            try:
+                ok = gepa_iteration()
+            except Exception as exc:
+                # Report a user facing error event and halt optimization.
+                self._update_error(total_iterations, iteration, exc)
+                break
+            elapsed = time.perf_counter() - start_time
+            if not ok:
+                break
+            remaining_iterations -= 1
+            self._update_progress(
+                total_iterations, iteration, remaining_iterations, elapsed
+            )
+
+    async def _a_run_loop_iteration(
+        self,
+        a_gepa_iteration: Callable[[], Awaitable[bool]],
+    ) -> None:
+        total_iterations = self.config.iterations
+        remaining_iterations = total_iterations
+        iteration = 0
+        self._update_progress(
+            total_iterations, iteration, remaining_iterations, 0
+        )
+        while remaining_iterations > 0:
+            iteration += 1
+            start_time = time.perf_counter()
+            try:
+                ok = await a_gepa_iteration()
+            except Exception as exc:
+                # Report a user facing error event and halt optimization.
+                self._update_error(total_iterations, iteration, exc)
+                break
+            elapsed = time.perf_counter() - start_time
+            if not ok:
+                break
+            remaining_iterations -= 1
+            self._update_progress(
+                total_iterations, iteration, remaining_iterations, elapsed
+            )
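
Editor's note: for readers skimming the new GEPA loop above, two small numeric rules are easy to miss in the diff: minibatch sizing in _draw_minibatch and the child-acceptance test in _should_accept_child. The snippet below restates just those two rules as standalone functions so they can be sanity-checked in isolation; it mirrors the logic shown in the diff but is an illustrative sketch, not code shipped in the package (the real methods read their thresholds from GEPAConfig).

from typing import Optional


def minibatch_size(
    n_feedback: int,
    fixed: Optional[int],
    ratio: float,
    min_size: int,
    max_size: int,
) -> int:
    # Mirrors _draw_minibatch: use the fixed size if configured, otherwise a
    # ratio of the feedback set clamped between min_size and max_size, and
    # never more than the number of available goldens.
    if fixed is not None:
        size = fixed
    else:
        dynamic = max(1, int(round(n_feedback * ratio)))
        size = max(min_size, min(dynamic, max_size))
    return max(1, min(size, n_feedback))


def should_accept(parent_score: float, child_score: float, min_delta: float) -> bool:
    # Mirrors _should_accept_child: the child must beat the parent on the
    # shared minibatch by at least max(min_delta, 1e-6).
    return child_score >= parent_score + max(min_delta, 1e-6)


assert minibatch_size(40, fixed=None, ratio=0.25, min_size=4, max_size=8) == 8
assert should_accept(0.795, 0.810, min_delta=0.01)      # improvement of 0.015 >= 0.01
assert not should_accept(0.795, 0.800, min_delta=0.01)  # improvement of 0.005 < 0.01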