deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/optimizer/algorithms/gepa/gepa.py (new file, +737 lines)
@@ -0,0 +1,737 @@
+ from __future__ import annotations
+ import uuid
+ import random
+ import time
+
+ from typing import (
+     Awaitable,
+     Callable,
+     Dict,
+     List,
+     Tuple,
+     TYPE_CHECKING,
+     Union,
+     Optional,
+ )
+
+ from deepeval.models.base_model import DeepEvalBaseLLM
+
+ from deepeval.errors import DeepEvalError
+ from deepeval.optimizer.utils import Aggregator, mean_of_all
+ from deepeval.optimizer.types import (
+     AcceptedIterationDict,
+     PromptConfiguration,
+     PromptConfigurationId,
+     ModuleId,
+     ScoreTable,
+     OptimizationReport,
+     RunnerStatusType,
+     RunnerStatusCallback,
+ )
+ from deepeval.optimizer.scorer.base import BaseScorer
+ from deepeval.optimizer.algorithms.base import BaseAlgorithm
+ from deepeval.optimizer.utils import (
+     split_goldens,
+     build_prompt_config_snapshots,
+ )
+ from deepeval.optimizer.policies import (
+     pick_best_with_ties,
+     select_prompt_configuration_pareto,
+     frequency_weights,
+     pareto_frontier,
+ )
+ from deepeval.prompt.api import PromptType
+ from deepeval.prompt.prompt import Prompt
+ from deepeval.optimizer.rewriter import Rewriter
+ from deepeval.optimizer.policies import TieBreaker
+ from deepeval.optimizer.algorithms.configs import (
+     GEPA_MIN_DELTA,
+     GEPA_TIE_TOLERANCE,
+     GEPA_REWRITE_INSTRUCTION_MAX_CHARS,
+ )
+
+
+ if TYPE_CHECKING:
+     from deepeval.dataset.golden import Golden, ConversationalGolden
+
+
+ class GEPA(BaseAlgorithm):
+     """
+     GEPA loop with sync/async execution.
+
+     This runner is intentionally low level and does not know about metrics,
+     models, or async configs. It relies on a preconfigured
+     Scorer and Rewriter, which are typically constructed by
+     the higher-level PromptOptimizer.
+
+     Parameters
+     ----------
+     iterations : int
+         Total number of GEPA loop iterations (mutation attempts). Default is 5.
+     minibatch_size : int
+         Number of examples drawn from D_feedback per iteration. Default is 8.
+     pareto_size : int
+         Size of the Pareto validation subset D_pareto. Default is 3.
+     random_seed : int, optional
+         RNG seed for reproducibility. If None, derived from time.time_ns().
+     tie_breaker : TieBreaker
+         Policy for breaking ties. Default is TieBreaker.PREFER_CHILD.
+     """
+
+     name = "GEPA"
+     SINGLE_MODULE_ID: ModuleId = "__module__"
+     TieBreaker = TieBreaker
+
+     def __init__(
+         self,
+         iterations: int = 5,
+         minibatch_size: int = 8,
+         pareto_size: int = 3,
+         random_seed: Optional[int] = None,
+         tie_breaker: TieBreaker = TieBreaker.PREFER_CHILD,
+         aggregate_instances: Aggregator = mean_of_all,
+         scorer: Optional[BaseScorer] = None,
+     ) -> None:
+         # Validate parameters
+         if iterations < 1:
+             raise ValueError("iterations must be >= 1")
+         if minibatch_size < 1:
+             raise ValueError("minibatch_size must be >= 1")
+         if pareto_size < 1:
+             raise ValueError("pareto_size must be >= 1")
+
+         self.iterations = iterations
+         self.minibatch_size = minibatch_size
+         self.pareto_size = pareto_size
+         self.tie_breaker = tie_breaker
+         self.aggregate_instances = aggregate_instances
+         self.scorer = scorer
+
+         # If no seed provided, use time-based seed
+         if random_seed is None:
+             random_seed = time.time_ns()
+         self.random_seed = random_seed
+         self.random_state = random.Random(random_seed)
+
+         # runtime state to be reset between runs
+         self.reset_state()
+
+         # Status callback set by PromptOptimizer:
+         # (kind, step_index, total_steps, detail) -> None
+         self.status_callback: Optional[RunnerStatusCallback] = None
+
+         # Optimizer model used by the rewriter for prompt mutation.
+         # Set by PromptOptimizer.
+         self.optimizer_model: Optional["DeepEvalBaseLLM"] = None
+
+         # lazy loaded
+         self._rewriter: Optional[Rewriter] = None
+
+     ##############
+     # Public API #
+     ##############
+
+     def execute(
+         self,
+         prompt: Prompt,
+         goldens: Union[List["Golden"], List["ConversationalGolden"]],
+     ) -> Tuple[Prompt, OptimizationReport]:
+         """Synchronous GEPA run from a full list of goldens (splits internally)."""
+         total_goldens = len(goldens)
+         if total_goldens < 2:
+             raise DeepEvalError(
+                 "GEPA prompt optimization requires at least 2 goldens, but "
+                 f"received {total_goldens}. Provide at least two goldens to "
+                 "run the optimizer."
+             )
+
+         self._ensure_scorer()
+         self.reset_state()
+
+         d_feedback, d_pareto = split_goldens(
+             goldens, self.pareto_size, random_state=self.random_state
+         )
+
+         seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
+         root_prompt_configuration = PromptConfiguration.new(
+             prompts=dict(seed_prompts_by_module)
+         )
+         self._add_prompt_configuration(root_prompt_configuration)
+
+         accepted_iterations: List[Dict] = []
+
+         def _one_iteration() -> bool:
+             nonlocal accepted_iterations
+
+             if not d_feedback:
+                 return False
+
+             # Seed Pareto scores lazily on first iteration
+             if not self.pareto_score_table:
+                 self.pareto_score_table[root_prompt_configuration.id] = (
+                     self.scorer.score_pareto(
+                         root_prompt_configuration, d_pareto
+                     )
+                 )
+
+             # 1. Pick prompt_configuration via Pareto
+             parent_prompt_configuration = self._pick_prompt_configuration()
+
+             # 2. Single module id
+             selected_module_id: ModuleId = self.SINGLE_MODULE_ID
+
+             # 3. Draw minibatch
+             minibatch = self._draw_minibatch(d_feedback)
+
+             # 4. Feedback
+             feedback_text = self.scorer.get_minibatch_feedback(
+                 parent_prompt_configuration, selected_module_id, minibatch
+             )
+
+             # 5. Rewrite
+             child_prompt = self._generate_child_prompt(
+                 selected_module_id, parent_prompt_configuration, feedback_text
+             )
+             if child_prompt is None:
+                 # Child prompt matched parent; skip this iteration.
+                 return True
+
+             # 6. Child prompt_configuration
+             child_prompt_configuration = self._make_child(
+                 selected_module_id, parent_prompt_configuration, child_prompt
+             )
+
+             # 7. Evaluate parent/child on minibatch
+             parent_score = self.scorer.score_minibatch(
+                 parent_prompt_configuration, minibatch
+             )
+             child_score = self.scorer.score_minibatch(
+                 child_prompt_configuration, minibatch
+             )
+
+             # 8. Acceptance test
+             accepted = self._should_accept_child(parent_score, child_score)
+             if accepted:
+                 accepted_iterations.append(
+                     self._accept_child(
+                         selected_module_id,
+                         parent_prompt_configuration,
+                         child_prompt_configuration,
+                         d_pareto,
+                         parent_score,
+                         child_score,
+                     )
+                 )
+
+             return True
+
+         self._run_loop_iteration(_one_iteration)
+         best = self._best_by_aggregate()
+         prompt_config_snapshots = build_prompt_config_snapshots(
+             self.prompt_configurations_by_id
+         )
+         report = OptimizationReport(
+             optimization_id=self.optimization_id,
+             best_id=best.id,
+             accepted_iterations=accepted_iterations,
+             pareto_scores=self.pareto_score_table,
+             parents=self.parents_by_id,
+             prompt_configurations=prompt_config_snapshots,
+         )
+         return best.prompts[self.SINGLE_MODULE_ID], report
+
+     async def a_execute(
+         self,
+         prompt: Prompt,
+         goldens: Union[List["Golden"], List["ConversationalGolden"]],
+     ) -> Tuple[Prompt, OptimizationReport]:
+         """Asynchronous twin of execute_gepa()."""
+         total_goldens = len(goldens)
+         if total_goldens < 2:
+             raise DeepEvalError(
+                 "GEPA prompt optimization requires at least 2 goldens, but "
+                 f"received {total_goldens}. Provide at least two goldens to "
+                 "run the optimizer."
+             )
+
+         self._ensure_scorer()
+         self.reset_state()
+
+         d_feedback, d_pareto = split_goldens(
+             goldens, self.pareto_size, random_state=self.random_state
+         )
+
+         seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
+         root_prompt_configuration = PromptConfiguration.new(
+             prompts=dict(seed_prompts_by_module)
+         )
+         self._add_prompt_configuration(root_prompt_configuration)
+
+         accepted_iterations: List[Dict] = []
+
+         async def _one_iteration() -> bool:
+             nonlocal accepted_iterations
+
+             if not d_feedback:
+                 return False
+
+             iter_start = time.perf_counter()
+
+             # Seed Pareto scores lazily on first iteration
+             if not self.pareto_score_table:
+                 t0 = time.perf_counter()
+                 self.pareto_score_table[root_prompt_configuration.id] = (
+                     await self.scorer.a_score_pareto(
+                         root_prompt_configuration, d_pareto
+                     )
+                 )
+                 print(
+                     f"[DEBUG] Initial pareto scoring ({len(d_pareto)} goldens): {time.perf_counter() - t0:.2f}s"
+                 )
+
+             # 1. Pick prompt_configuration via Pareto
+             parent_prompt_configuration = self._pick_prompt_configuration()
+
+             # 2. Single module id
+             selected_module_id: ModuleId = self.SINGLE_MODULE_ID
+
+             # 3. Draw minibatch
+             minibatch = self._draw_minibatch(d_feedback)
+             print(f"[DEBUG] Minibatch size: {len(minibatch)}")
+
+             # 4. Feedback
+             t0 = time.perf_counter()
+             feedback_text = await self.scorer.a_get_minibatch_feedback(
+                 parent_prompt_configuration, selected_module_id, minibatch
+             )
+             print(f"[DEBUG] Get feedback: {time.perf_counter() - t0:.2f}s")
+
+             # 5. Rewrite
+             t0 = time.perf_counter()
+             child_prompt = await self._a_generate_child_prompt(
+                 selected_module_id, parent_prompt_configuration, feedback_text
+             )
+             print(f"[DEBUG] Rewrite prompt: {time.perf_counter() - t0:.2f}s")
+             if child_prompt is None:
+                 print(f"[DEBUG] Child prompt same as parent, skipping")
+                 return True
+
+             # 6. Child prompt_configuration
+             child_prompt_configuration = self._make_child(
+                 selected_module_id, parent_prompt_configuration, child_prompt
+             )
+
+             # 7. Evaluate parent/child on minibatch
+             t0 = time.perf_counter()
+             parent_score = await self.scorer.a_score_minibatch(
+                 parent_prompt_configuration, minibatch
+             )
+             print(
+                 f"[DEBUG] Score parent on minibatch: {time.perf_counter() - t0:.2f}s (score={parent_score:.4f})"
+             )
+
+             t0 = time.perf_counter()
+             child_score = await self.scorer.a_score_minibatch(
+                 child_prompt_configuration, minibatch
+             )
+             print(
+                 f"[DEBUG] Score child on minibatch: {time.perf_counter() - t0:.2f}s (score={child_score:.4f})"
+             )
+
+             # 8. Acceptance test
+             accepted = self._should_accept_child(parent_score, child_score)
+             print(
+                 f"[DEBUG] Acceptance: {'ACCEPTED' if accepted else 'REJECTED'}"
+             )
+             if accepted:
+                 t0 = time.perf_counter()
+                 accepted_iterations.append(
+                     await self._a_accept_child(
+                         selected_module_id,
+                         parent_prompt_configuration,
+                         child_prompt_configuration,
+                         d_pareto,
+                         parent_score,
+                         child_score,
+                     )
+                 )
+                 print(
+                     f"[DEBUG] Accept child (pareto scoring): {time.perf_counter() - t0:.2f}s"
+                 )
+
+             print(
+                 f"[DEBUG] Total iteration time: {time.perf_counter() - iter_start:.2f}s\n"
+             )
+             return True
+
+         await self._a_run_loop_iteration(_one_iteration)
+         best = self._best_by_aggregate()
+         prompt_config_snapshots = build_prompt_config_snapshots(
+             self.prompt_configurations_by_id
+         )
+         report = OptimizationReport(
+             optimization_id=self.optimization_id,
+             best_id=best.id,
+             accepted_iterations=accepted_iterations,
+             pareto_scores=self.pareto_score_table,
+             parents=self.parents_by_id,
+             prompt_configurations=prompt_config_snapshots,
+         )
+         return best.prompts[self.SINGLE_MODULE_ID], report
+
+     ###################
+     # State & helpers #
+     ###################
+
+     def reset_state(self) -> None:
+         self.optimization_id = str(uuid.uuid4())
+         self.prompt_configurations_by_id: Dict[
+             PromptConfigurationId, PromptConfiguration
+         ] = {}
+         self.parents_by_id: Dict[
+             PromptConfigurationId, Optional[PromptConfigurationId]
+         ] = {}
+         self.pareto_score_table: ScoreTable = {}
+
+     def _ensure_scorer(self) -> None:
+         if self.scorer is None:
+             raise DeepEvalError(
+                 "GEPARunner requires a `scorer`. "
+                 "Construct one (for example, Scorer) in "
+                 "PromptOptimizer and assign it to `runner.scorer`."
+             )
+
+     def _prompts_equivalent(
+         self, old_prompt: Prompt, new_prompt: Prompt
+     ) -> bool:
+         """
+         Compare two Prompts for GEPA acceptance purposes.
+
+         This is used as:
+             if self._prompts_equivalent(old, new):
+                 # reject child (treat as "no change")
+                 return None
+
+         So:
+         - Return True: "do not accept this child"
+         - Return False: "child is meaningfully different"
+
+         Rules:
+         - If the types must be the same for this check to be meaningful
+         - For TEXT: compare text_template with whitespace trimmed
+         - For LIST: compare messages_template (length, role, and content,
+           with content whitespace trimmed).
+         """
+
+         # LIST prompts: compare messages
+         if new_prompt.type == PromptType.LIST:
+             old_msgs = old_prompt.messages_template
+             new_msgs = new_prompt.messages_template
+             if len(old_msgs) != len(new_msgs):
+                 return False
+
+             for old_msg, new_msg in zip(old_msgs, new_msgs):
+                 if old_msg.role != new_msg.role:
+                     return False
+                 if (old_msg.content or "").strip() != (
+                     new_msg.content or ""
+                 ).strip():
+                     return False
+
+             return True
+
+         # TEXT prompts: compare text_template
+         old_txt = (old_prompt.text_template or "").strip()
+         new_txt = (new_prompt.text_template or "").strip()
+         return new_txt == old_txt
+
+     def _add_prompt_configuration(
+         self, prompt_configuration: PromptConfiguration
+     ) -> None:
+         self.prompt_configurations_by_id[prompt_configuration.id] = (
+             prompt_configuration
+         )
+         self.parents_by_id[prompt_configuration.id] = (
+             prompt_configuration.parent
+         )
+
+     def _best_by_aggregate(self) -> PromptConfiguration:
+         totals = {
+             prompt_configuration_id: self.aggregate_instances(vector)
+             for prompt_configuration_id, vector in self.pareto_score_table.items()
+         }
+
+         chosen, tied, max_val = pick_best_with_ties(
+             totals,
+             self.parents_by_id,
+             random_state=self.random_state,
+             tie_tolerance=GEPA_TIE_TOLERANCE,
+             policy=self.tie_breaker,
+         )
+         if self.status_callback is not None and len(tied) > 1:
+             msg = (
+                 f"tie on aggregate={max_val:.4f} among {len(tied)} "
+                 f"prompt_configurations; using tie_breaker="
+                 f"{self.tie_breaker.value!r} selected {chosen}. "
+                 f"To change, set GEPA tie_breaker to one of: "
+                 f"{[t.value for t in self.TieBreaker]}."
+             )
+             self.status_callback(
+                 RunnerStatusType.TIE,
+                 detail=msg,
+             )
+
+         return self.prompt_configurations_by_id[chosen]
+
+     def _pick_prompt_configuration(self) -> PromptConfiguration:
+         # Log Pareto selection details
+         all_candidates = list(self.pareto_score_table.keys())
+         print(f"[DEBUG] Pareto Selection:")
+         print(f" - Total candidates in pool: {len(all_candidates)}")
+
+         # Show score table
+         print(f" - Score table (per-instance scores):")
+         for cid, scores in self.pareto_score_table.items():
+             is_root = self.parents_by_id.get(cid) is None
+             label = (
+                 "(root)"
+                 if is_root
+                 else f"(child of {self.parents_by_id.get(cid)[:8]}...)"
+             )
+             mean_score = sum(scores) / len(scores) if scores else 0
+             print(
+                 f" {cid[:8]}... {label}: {[round(s, 3) for s in scores]} (mean={mean_score:.3f})"
+             )
+
+         # Show Pareto frontier
+         frontier = pareto_frontier(all_candidates, self.pareto_score_table)
+         print(f" - Pareto frontier ({len(frontier)} non-dominated):")
+         for cid in frontier:
+             print(f" {cid[:8]}...")
+
+         # Show frequency weights
+         freq = frequency_weights(self.pareto_score_table)
+         print(f" - Frequency weights (how often each wins an instance):")
+         for cid, weight in freq.items():
+             print(f" {cid[:8]}...: {weight}")
+
+         # Do the selection
+         selected_prompt_configuration_id = select_prompt_configuration_pareto(
+             self.pareto_score_table, random_state=self.random_state
+         )
+         print(f" - Selected: {selected_prompt_configuration_id[:8]}...\n")
+
+         return self.prompt_configurations_by_id[
+             selected_prompt_configuration_id
+         ]
+
+     def _draw_minibatch(
+         self, d_feedback: Union[List["Golden"], List["ConversationalGolden"]]
+     ) -> Union[List["Golden"], List["ConversationalGolden"]]:
+         # Determine effective minibatch size, bounded by the
+         # available feedback set.
+         n_feedback = len(d_feedback)
+         if n_feedback <= 0:
+             return []
+
+         size = min(self.minibatch_size, n_feedback)
+
+         return [
+             d_feedback[self.random_state.randrange(0, n_feedback)]
+             for _ in range(size)
+         ]
+
+     async def _a_generate_child_prompt(
+         self,
+         selected_module_id: ModuleId,
+         parent_prompt_configuration: PromptConfiguration,
+         feedback_text: str,
+     ) -> Optional[Prompt]:
+         old_prompt = parent_prompt_configuration.prompts.get(
+             selected_module_id, Prompt(text_template="")
+         )
+
+         new_prompt = await self._rewriter.a_rewrite(
+             module_id=selected_module_id,
+             old_prompt=old_prompt,
+             feedback_text=feedback_text,
+         )
+
+         if old_prompt.type != new_prompt.type or self._prompts_equivalent(
+             old_prompt, new_prompt
+         ):
+             # don't accept if new prompt is the same as parent
+             # or if the type somehow changed
+             return None
+         return new_prompt
+
+     def _generate_child_prompt(
+         self,
+         selected_module_id: ModuleId,
+         parent_prompt_configuration: PromptConfiguration,
+         feedback_text: str,
+     ) -> Optional[Prompt]:
+         old_prompt = parent_prompt_configuration.prompts.get(
+             selected_module_id, Prompt(text_template="")
+         )
+
+         new_prompt = self._rewriter.rewrite(
+             module_id=selected_module_id,
+             old_prompt=old_prompt,
+             feedback_text=feedback_text,
+         )
+
+         if old_prompt.type != new_prompt.type or self._prompts_equivalent(
+             old_prompt, new_prompt
+         ):
+             # don't accept if new prompt is the same as parent
+             # or if the type somehow changed
+             return None
+         return new_prompt
+
+     def _make_child(
+         self,
+         selected_module_id: ModuleId,
+         parent_prompt_configuration: PromptConfiguration,
+         child_prompt: Prompt,
+     ) -> PromptConfiguration:
+         child_prompt_configuration = PromptConfiguration.new(
+             prompts=dict(parent_prompt_configuration.prompts),
+             parent=parent_prompt_configuration.id,
+         )
+         child_prompt_configuration.prompts[selected_module_id] = child_prompt
+         return child_prompt_configuration
+
+     def _should_accept_child(
+         self, parent_score: float, child_score: float
+     ) -> bool:
+         jitter = 1e-6
+         return child_score >= parent_score + max(GEPA_MIN_DELTA, jitter)
+
+     def _accept_child(
+         self,
+         selected_module_id: ModuleId,
+         parent_prompt_configuration: PromptConfiguration,
+         child_prompt_configuration: PromptConfiguration,
+         d_pareto: Union[List["Golden"], List["ConversationalGolden"]],
+         parent_score: float,
+         child_score: float,
+     ) -> AcceptedIterationDict:
+         self._add_prompt_configuration(child_prompt_configuration)
+         self.pareto_score_table[child_prompt_configuration.id] = (
+             self.scorer.score_pareto(child_prompt_configuration, d_pareto)
+         )
+
+         return AcceptedIterationDict(
+             parent=parent_prompt_configuration.id,
+             child=child_prompt_configuration.id,
+             module=selected_module_id,
+             before=parent_score,
+             after=child_score,
+         )
+
+     async def _a_accept_child(
+         self,
+         selected_module_id: ModuleId,
+         parent_prompt_configuration: PromptConfiguration,
+         child_prompt_configuration: PromptConfiguration,
+         d_pareto: Union[List["Golden"], List["ConversationalGolden"]],
+         parent_score: float,
+         child_score: float,
+     ) -> AcceptedIterationDict:
+         self._add_prompt_configuration(child_prompt_configuration)
+         self.pareto_score_table[child_prompt_configuration.id] = (
+             await self.scorer.a_score_pareto(
+                 child_prompt_configuration, d_pareto
+             )
+         )
+
+         return AcceptedIterationDict(
+             parent=parent_prompt_configuration.id,
+             child=child_prompt_configuration.id,
+             module=selected_module_id,
+             before=parent_score,
+             after=child_score,
+         )
+
+     def _update_progress(
+         self,
+         total_iterations: int,
+         iteration: int,
+         remaining_iterations: int,
+     ):
+         if self.status_callback is not None:
+             detail = (
+                 f"(iterations={total_iterations}) "
+                 f"• iteration {iteration}/{total_iterations} "
+                 f"• remaining={remaining_iterations}"
+             )
+             self.status_callback(
+                 RunnerStatusType.PROGRESS,
+                 step_index=iteration,
+                 total_steps=total_iterations,
+                 detail=detail,
+             )
+
+     def _update_error(
+         self, total_iterations: int, iteration: int, exc: Exception
+     ):
+         # Report a user facing error event
+         if self.status_callback is not None:
+             detail = (
+                 f"(iterations={total_iterations}) "
+                 f"• error {exc.__class__.__name__}: {exc} "
+                 f"• halted at iteration {iteration}"
+             )
+             self.status_callback(
+                 RunnerStatusType.ERROR,
+                 step_index=iteration,
+                 total_steps=total_iterations,
+                 detail=detail,
+             )
+
+     def _run_loop_iteration(
+         self,
+         gepa_iteration: Callable[[], bool],
+     ) -> None:
+         total_iterations = self.iterations
+         remaining_iterations = total_iterations
+         iteration = 0
+         self._update_progress(total_iterations, iteration, remaining_iterations)
+         while remaining_iterations > 0:
+             iteration += 1
+             try:
+                 ok = gepa_iteration()
+             except Exception as exc:
+                 # Report a user facing error event and halt optimization.
+                 self._update_error(total_iterations, iteration, exc)
+                 break
+             if not ok:
+                 break
+             remaining_iterations -= 1
+             self._update_progress(
+                 total_iterations, iteration, remaining_iterations
+             )
+
+     async def _a_run_loop_iteration(
+         self,
+         a_gepa_iteration: Callable[[], Awaitable[bool]],
+     ) -> None:
+         total_iterations = self.iterations
+         remaining_iterations = total_iterations
+         iteration = 0
+         self._update_progress(total_iterations, iteration, remaining_iterations)
+         while remaining_iterations > 0:
+             iteration += 1
+             try:
+                 ok = await a_gepa_iteration()
+             except Exception as exc:
+                 # Report a user facing error event and halt optimization.
+                 self._update_error(total_iterations, iteration, exc)
+                 break
+             if not ok:
+                 break
+             remaining_iterations -= 1
+             self._update_progress(
+                 total_iterations, iteration, remaining_iterations
+             )
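For orientation, the sketch below shows the contract GEPA calls into on its scorer (score_pareto / score_minibatch / get_minibatch_feedback and their a_-prefixed async twins, which are exactly the calls made in the loop above) and the shape of an execute() call. It is a minimal, hypothetical sketch assembled only from this diff, not an official deepeval example: the re-export of GEPA from deepeval.optimizer.algorithms.gepa, the exact abstract surface of BaseScorer, and the Golden(input=...) constructor are assumptions, and in normal use the higher-level PromptOptimizer builds the scorer, rewriter, and optimizer model before running the loop.

    # Hypothetical usage sketch (assumptions flagged inline); not part of the wheel.
    from typing import List

    from deepeval.dataset.golden import Golden              # Golden(input=...) assumed
    from deepeval.optimizer.algorithms.gepa import GEPA      # re-export path assumed
    from deepeval.optimizer.scorer.base import BaseScorer
    from deepeval.prompt.prompt import Prompt


    class ConstantScorer(BaseScorer):
        """Stub scorer mirroring the hooks GEPA calls in the loop above."""

        def score_pareto(self, prompt_configuration, goldens) -> List[float]:
            # One score per golden in D_pareto: the per-instance vector GEPA
            # stores in its pareto_score_table.
            return [0.5 for _ in goldens]

        async def a_score_pareto(self, prompt_configuration, goldens) -> List[float]:
            return self.score_pareto(prompt_configuration, goldens)

        def score_minibatch(self, prompt_configuration, minibatch) -> float:
            # Single aggregate score compared in _should_accept_child.
            return 0.5

        async def a_score_minibatch(self, prompt_configuration, minibatch) -> float:
            return 0.5

        def get_minibatch_feedback(self, prompt_configuration, module_id, minibatch) -> str:
            # Free-form feedback text handed to the rewriter.
            return "Be more specific and cite the retrieved context."

        async def a_get_minibatch_feedback(self, prompt_configuration, module_id, minibatch) -> str:
            return self.get_minibatch_feedback(prompt_configuration, module_id, minibatch)


    runner = GEPA(iterations=3, minibatch_size=4, pareto_size=2, scorer=ConstantScorer())
    seed_prompt = Prompt(text_template="Answer the user's question: {question}")
    goldens = [Golden(input="What is GEPA?"), Golden(input="Why Pareto selection?")]

    # PromptOptimizer normally assigns runner.optimizer_model and the rewriter
    # before this call; with the stub wiring above, the loop halts on the first
    # rewrite attempt and returns the seed prompt plus an OptimizationReport.
    best_prompt, report = runner.execute(seed_prompt, goldens)

One design point visible in the code above: the Pareto table keeps a per-instance score vector for each prompt configuration rather than a single mean, which is what lets pareto_frontier and frequency_weights favor candidates that win on different validation instances instead of collapsing everything to one aggregate early.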