deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. deepeval/_version.py +1 -1
  2. deepeval/dataset/golden.py +54 -2
  3. deepeval/evaluate/evaluate.py +16 -8
  4. deepeval/evaluate/execute.py +70 -26
  5. deepeval/evaluate/utils.py +26 -22
  6. deepeval/integrations/pydantic_ai/agent.py +19 -2
  7. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  8. deepeval/metrics/__init__.py +14 -12
  9. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  10. deepeval/metrics/answer_relevancy/template.py +188 -92
  11. deepeval/metrics/base_metric.py +2 -5
  12. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  13. deepeval/metrics/contextual_precision/template.py +115 -66
  14. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  15. deepeval/metrics/contextual_recall/template.py +106 -55
  16. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  17. deepeval/metrics/contextual_relevancy/template.py +87 -58
  18. deepeval/metrics/dag/templates.py +2 -2
  19. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  20. deepeval/metrics/faithfulness/schema.py +1 -1
  21. deepeval/metrics/faithfulness/template.py +200 -115
  22. deepeval/metrics/g_eval/utils.py +2 -2
  23. deepeval/metrics/indicator.py +4 -4
  24. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  25. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  26. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  27. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  28. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  29. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  31. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  32. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  33. deepeval/metrics/ragas.py +3 -3
  34. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  35. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  36. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  37. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  38. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  39. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  40. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  41. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  42. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  43. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  44. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  45. deepeval/metrics/turn_faithfulness/template.py +218 -0
  46. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  47. deepeval/metrics/utils.py +39 -58
  48. deepeval/models/__init__.py +0 -12
  49. deepeval/models/base_model.py +16 -38
  50. deepeval/models/embedding_models/__init__.py +7 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +52 -28
  52. deepeval/models/embedding_models/local_embedding_model.py +18 -14
  53. deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
  54. deepeval/models/embedding_models/openai_embedding_model.py +40 -21
  55. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  56. deepeval/models/llms/anthropic_model.py +44 -23
  57. deepeval/models/llms/azure_model.py +121 -36
  58. deepeval/models/llms/deepseek_model.py +18 -13
  59. deepeval/models/llms/gemini_model.py +129 -43
  60. deepeval/models/llms/grok_model.py +18 -13
  61. deepeval/models/llms/kimi_model.py +18 -13
  62. deepeval/models/llms/litellm_model.py +42 -22
  63. deepeval/models/llms/local_model.py +12 -7
  64. deepeval/models/llms/ollama_model.py +114 -12
  65. deepeval/models/llms/openai_model.py +137 -41
  66. deepeval/models/llms/portkey_model.py +24 -7
  67. deepeval/models/llms/utils.py +5 -3
  68. deepeval/models/retry_policy.py +17 -14
  69. deepeval/models/utils.py +46 -1
  70. deepeval/optimizer/__init__.py +5 -0
  71. deepeval/optimizer/algorithms/__init__.py +6 -0
  72. deepeval/optimizer/algorithms/base.py +29 -0
  73. deepeval/optimizer/algorithms/configs.py +18 -0
  74. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  75. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  76. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  77. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  78. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  79. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  80. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  81. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  82. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  83. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  84. deepeval/{optimization → optimizer}/configs.py +5 -8
  85. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  86. deepeval/optimizer/prompt_optimizer.py +263 -0
  87. deepeval/optimizer/rewriter/__init__.py +5 -0
  88. deepeval/optimizer/rewriter/rewriter.py +124 -0
  89. deepeval/optimizer/rewriter/utils.py +214 -0
  90. deepeval/optimizer/scorer/__init__.py +5 -0
  91. deepeval/optimizer/scorer/base.py +86 -0
  92. deepeval/optimizer/scorer/scorer.py +316 -0
  93. deepeval/optimizer/scorer/utils.py +30 -0
  94. deepeval/optimizer/types.py +148 -0
  95. deepeval/{optimization → optimizer}/utils.py +47 -165
  96. deepeval/prompt/prompt.py +5 -9
  97. deepeval/test_case/__init__.py +1 -3
  98. deepeval/test_case/api.py +12 -10
  99. deepeval/test_case/conversational_test_case.py +19 -1
  100. deepeval/test_case/llm_test_case.py +152 -1
  101. deepeval/test_case/utils.py +4 -8
  102. deepeval/test_run/api.py +15 -14
  103. deepeval/test_run/test_run.py +3 -3
  104. deepeval/tracing/patchers.py +9 -4
  105. deepeval/tracing/tracing.py +2 -2
  106. deepeval/utils.py +65 -0
  107. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  108. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
  109. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  110. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  111. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  112. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  113. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  114. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  115. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  116. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  117. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  118. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  119. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  120. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  121. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  122. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  123. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  124. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  125. deepeval/models/mlllms/__init__.py +0 -4
  126. deepeval/models/mlllms/azure_model.py +0 -343
  127. deepeval/models/mlllms/gemini_model.py +0 -313
  128. deepeval/models/mlllms/ollama_model.py +0 -175
  129. deepeval/models/mlllms/openai_model.py +0 -309
  130. deepeval/optimization/__init__.py +0 -13
  131. deepeval/optimization/adapters/__init__.py +0 -2
  132. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  133. deepeval/optimization/aggregates.py +0 -14
  134. deepeval/optimization/copro/configs.py +0 -31
  135. deepeval/optimization/gepa/__init__.py +0 -7
  136. deepeval/optimization/gepa/configs.py +0 -115
  137. deepeval/optimization/miprov2/configs.py +0 -134
  138. deepeval/optimization/miprov2/loop.py +0 -785
  139. deepeval/optimization/mutations/__init__.py +0 -0
  140. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  141. deepeval/optimization/policies/__init__.py +0 -16
  142. deepeval/optimization/policies/tie_breaker.py +0 -67
  143. deepeval/optimization/prompt_optimizer.py +0 -462
  144. deepeval/optimization/simba/__init__.py +0 -0
  145. deepeval/optimization/simba/configs.py +0 -33
  146. deepeval/optimization/types.py +0 -361
  147. deepeval/test_case/mllm_test_case.py +0 -170
  148. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  149. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  150. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  152. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  153. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  154. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  155. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py}
@@ -14,86 +14,118 @@ from typing import (
  Optional,
  )

+ from deepeval.models.base_model import DeepEvalBaseLLM
+
  from deepeval.errors import DeepEvalError
- from deepeval.optimization.aggregates import Aggregator, mean_of_all
- from deepeval.optimization.types import (
+ from deepeval.optimizer.utils import Aggregator, mean_of_all
+ from deepeval.optimizer.types import (
  AcceptedIterationDict,
  PromptConfiguration,
  PromptConfigurationId,
  ModuleId,
  ScoreTable,
- ScoringAdapter,
- OptimizationResult,
+ OptimizationReport,
  RunnerStatusType,
- RunnerStatusCallbackProtocol,
+ RunnerStatusCallback,
  )
- from deepeval.optimization.utils import (
+ from deepeval.optimizer.scorer.base import BaseScorer
+ from deepeval.optimizer.algorithms.base import BaseAlgorithm
+ from deepeval.optimizer.utils import (
  split_goldens,
  build_prompt_config_snapshots,
  )
- from deepeval.optimization.policies import (
+ from deepeval.optimizer.policies import (
  pick_best_with_ties,
  select_prompt_configuration_pareto,
+ frequency_weights,
+ pareto_frontier,
  )
  from deepeval.prompt.api import PromptType
  from deepeval.prompt.prompt import Prompt
- from deepeval.optimization.mutations.prompt_rewriter import (
- PromptRewriter,
+ from deepeval.optimizer.rewriter import Rewriter
+ from deepeval.optimizer.policies import TieBreaker
+ from deepeval.optimizer.algorithms.configs import (
+ GEPA_MIN_DELTA,
+ GEPA_TIE_TOLERANCE,
+ GEPA_REWRITE_INSTRUCTION_MAX_CHARS,
  )
- from .configs import GEPAConfig


  if TYPE_CHECKING:
  from deepeval.dataset.golden import Golden, ConversationalGolden


- class GEPARunner:
+ class GEPA(BaseAlgorithm):
  """
  GEPA loop with sync/async execution.

  This runner is intentionally low level and does not know about metrics,
  models, or async configs. It relies on a preconfigured
- ScoringAdapter and PromptRewriter, which are typically constructed by
+ Scorer and Rewriter, which are typically constructed by
  the higher-level PromptOptimizer.
+
+ Parameters
+ ----------
+ iterations : int
+ Total number of GEPA loop iterations (mutation attempts). Default is 5.
+ minibatch_size : int
+ Number of examples drawn from D_feedback per iteration. Default is 8.
+ pareto_size : int
+ Size of the Pareto validation subset D_pareto. Default is 3.
+ random_seed : int, optional
+ RNG seed for reproducibility. If None, derived from time.time_ns().
+ tie_breaker : TieBreaker
+ Policy for breaking ties. Default is TieBreaker.PREFER_CHILD.
  """

+ name = "GEPA"
  SINGLE_MODULE_ID: ModuleId = "__module__"
+ TieBreaker = TieBreaker

  def __init__(
  self,
- *,
- config: GEPAConfig,
+ iterations: int = 5,
+ minibatch_size: int = 8,
+ pareto_size: int = 3,
+ random_seed: Optional[int] = None,
+ tie_breaker: TieBreaker = TieBreaker.PREFER_CHILD,
  aggregate_instances: Aggregator = mean_of_all,
- scoring_adapter: Optional[ScoringAdapter] = None,
+ scorer: Optional[BaseScorer] = None,
  ) -> None:
- self.config = config
+ # Validate parameters
+ if iterations < 1:
+ raise ValueError("iterations must be >= 1")
+ if minibatch_size < 1:
+ raise ValueError("minibatch_size must be >= 1")
+ if pareto_size < 1:
+ raise ValueError("pareto_size must be >= 1")
+
+ self.iterations = iterations
+ self.minibatch_size = minibatch_size
+ self.pareto_size = pareto_size
+ self.tie_breaker = tie_breaker
  self.aggregate_instances = aggregate_instances
- self.scoring_adapter = scoring_adapter
+ self.scorer = scorer

- # random seeded from config is used for splits, sampling, and tie-breaking.
- self.random_state = random.Random(config.random_seed)
+ # If no seed provided, use time-based seed
+ if random_seed is None:
+ random_seed = time.time_ns()
+ self.random_seed = random_seed
+ self.random_state = random.Random(random_seed)

  # runtime state to be reset between runs
  self.reset_state()

  # Status callback set by PromptOptimizer:
  # (kind, step_index, total_steps, detail) -> None
- self.status_callback: Optional[RunnerStatusCallbackProtocol] = None
-
- # Model callback used by the rewriter set by PromptOptimizer.
- self.model_callback: Optional[
- Callable[
- ...,
- Union[
- str,
- Dict,
- Tuple[Union[str, Dict], float],
- ],
- ]
- ] = None
+ self.status_callback: Optional[RunnerStatusCallback] = None
+
+ # Optimizer model used by the rewriter for prompt mutation.
+ # Set by PromptOptimizer.
+ self.optimizer_model: Optional["DeepEvalBaseLLM"] = None

  # lazy loaded
- self._rewriter: Optional[PromptRewriter] = None
+ self._rewriter: Optional[Rewriter] = None

  ##############
  # Public API #
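
For orientation, here is a minimal sketch of how the renamed GEPA class could be constructed under the new keyword signature shown in the hunk above. The direct module path follows from this diff's file rename; handing the instance to PromptOptimizer is implied by the docstring but not shown here, so treat the surrounding wiring as an assumption.

    # Hypothetical usage sketch based only on the constructor shown above.
    from deepeval.optimizer.algorithms.gepa.gepa import GEPA  # the package __init__ likely re-exports this

    algorithm = GEPA(
        iterations=5,                              # GEPA loop iterations (mutation attempts)
        minibatch_size=8,                          # goldens drawn from D_feedback per iteration
        pareto_size=3,                             # size of the Pareto validation subset D_pareto
        random_seed=42,                            # omit to derive a seed from time.time_ns()
        tie_breaker=GEPA.TieBreaker.PREFER_CHILD,  # tie-breaking policy (class-level alias shown above)
    )
    # A BaseScorer and an optimizer_model are normally attached by the higher-level
    # PromptOptimizer; execute() raises DeepEvalError if no scorer has been set.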
@@ -101,10 +133,9 @@ class GEPARunner:

  def execute(
  self,
- *,
  prompt: Prompt,
  goldens: Union[List["Golden"], List["ConversationalGolden"]],
- ) -> Tuple[Prompt, Dict]:
+ ) -> Tuple[Prompt, OptimizationReport]:
  """Synchronous GEPA run from a full list of goldens (splits internally)."""
  total_goldens = len(goldens)
  if total_goldens < 2:
@@ -114,12 +145,11 @@ class GEPARunner:
  "run the optimizer."
  )

- self._ensure_scoring_adapter()
- self._ensure_rewriter()
+ self._ensure_scorer()
  self.reset_state()

  d_feedback, d_pareto = split_goldens(
- goldens, self.config.pareto_size, random_state=self.random_state
+ goldens, self.pareto_size, random_state=self.random_state
  )

  seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
@@ -139,7 +169,7 @@ class GEPARunner:
  # Seed Pareto scores lazily on first iteration
  if not self.pareto_score_table:
  self.pareto_score_table[root_prompt_configuration.id] = (
- self.scoring_adapter.score_on_pareto(
+ self.scorer.score_pareto(
  root_prompt_configuration, d_pareto
  )
  )
@@ -154,7 +184,7 @@
  minibatch = self._draw_minibatch(d_feedback)

  # 4. Feedback
- feedback_text = self.scoring_adapter.minibatch_feedback(
+ feedback_text = self.scorer.get_minibatch_feedback(
  parent_prompt_configuration, selected_module_id, minibatch
  )

@@ -172,15 +202,16 @@
  )

  # 7. Evaluate parent/child on minibatch
- parent_score = self.scoring_adapter.minibatch_score(
+ parent_score = self.scorer.score_minibatch(
  parent_prompt_configuration, minibatch
  )
- child_score = self.scoring_adapter.minibatch_score(
+ child_score = self.scorer.score_minibatch(
  child_prompt_configuration, minibatch
  )

  # 8. Acceptance test
- if self._should_accept_child(parent_score, child_score):
+ accepted = self._should_accept_child(parent_score, child_score)
+ if accepted:
  accepted_iterations.append(
  self._accept_child(
  selected_module_id,
@@ -199,7 +230,7 @@
  prompt_config_snapshots = build_prompt_config_snapshots(
  self.prompt_configurations_by_id
  )
- report = OptimizationResult(
+ report = OptimizationReport(
  optimization_id=self.optimization_id,
  best_id=best.id,
  accepted_iterations=accepted_iterations,
@@ -207,14 +238,13 @@
  parents=self.parents_by_id,
  prompt_configurations=prompt_config_snapshots,
  )
- return best.prompts[self.SINGLE_MODULE_ID], report.as_dict()
+ return best.prompts[self.SINGLE_MODULE_ID], report

  async def a_execute(
  self,
- *,
  prompt: Prompt,
  goldens: Union[List["Golden"], List["ConversationalGolden"]],
- ) -> Tuple[Prompt, Dict]:
+ ) -> Tuple[Prompt, OptimizationReport]:
  """Asynchronous twin of execute_gepa()."""
  total_goldens = len(goldens)
  if total_goldens < 2:
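
As the hunks above show, execute() and a_execute() now return the OptimizationReport object directly instead of report.as_dict(), and the keyword-only `*` marker was dropped. A small sketch of the 3.7.5 call pattern, assuming `algorithm`, `prompt`, and `goldens` (at least two goldens) are already set up and that OptimizationReport exposes its constructor fields as attributes:

    best_prompt, report = algorithm.execute(prompt=prompt, goldens=goldens)
    # In 3.7.4 the second return value was a plain dict produced by report.as_dict().
    print(report.best_id)              # id of the best prompt configuration
    print(report.accepted_iterations)  # accepted GEPA iterations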
@@ -224,12 +254,11 @@
  "run the optimizer."
  )

- self._ensure_scoring_adapter()
- self._ensure_rewriter()
+ self._ensure_scorer()
  self.reset_state()

  d_feedback, d_pareto = split_goldens(
- goldens, self.config.pareto_size, random_state=self.random_state
+ goldens, self.pareto_size, random_state=self.random_state
  )

  seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
@@ -246,13 +275,19 @@
  if not d_feedback:
  return False

+ iter_start = time.perf_counter()
+
  # Seed Pareto scores lazily on first iteration
  if not self.pareto_score_table:
+ t0 = time.perf_counter()
  self.pareto_score_table[root_prompt_configuration.id] = (
- await self.scoring_adapter.a_score_on_pareto(
+ await self.scorer.a_score_pareto(
  root_prompt_configuration, d_pareto
  )
  )
+ print(
+ f"[DEBUG] Initial pareto scoring ({len(d_pareto)} goldens): {time.perf_counter() - t0:.2f}s"
+ )

  # 1. Pick prompt_configuration via Pareto
  parent_prompt_configuration = self._pick_prompt_configuration()
@@ -262,18 +297,23 @@

  # 3. Draw minibatch
  minibatch = self._draw_minibatch(d_feedback)
+ print(f"[DEBUG] Minibatch size: {len(minibatch)}")

  # 4. Feedback
- feedback_text = await self.scoring_adapter.a_minibatch_feedback(
+ t0 = time.perf_counter()
+ feedback_text = await self.scorer.a_get_minibatch_feedback(
  parent_prompt_configuration, selected_module_id, minibatch
  )
+ print(f"[DEBUG] Get feedback: {time.perf_counter() - t0:.2f}s")

  # 5. Rewrite
+ t0 = time.perf_counter()
  child_prompt = await self._a_generate_child_prompt(
  selected_module_id, parent_prompt_configuration, feedback_text
  )
+ print(f"[DEBUG] Rewrite prompt: {time.perf_counter() - t0:.2f}s")
  if child_prompt is None:
- # Child prompt matched parent; skip this iteration.
+ print(f"[DEBUG] Child prompt same as parent, skipping")
  return True

  # 6. Child prompt_configuration
@@ -282,15 +322,29 @@
  )

  # 7. Evaluate parent/child on minibatch
- parent_score = await self.scoring_adapter.a_minibatch_score(
+ t0 = time.perf_counter()
+ parent_score = await self.scorer.a_score_minibatch(
  parent_prompt_configuration, minibatch
  )
- child_score = await self.scoring_adapter.a_minibatch_score(
+ print(
+ f"[DEBUG] Score parent on minibatch: {time.perf_counter() - t0:.2f}s (score={parent_score:.4f})"
+ )
+
+ t0 = time.perf_counter()
+ child_score = await self.scorer.a_score_minibatch(
  child_prompt_configuration, minibatch
  )
+ print(
+ f"[DEBUG] Score child on minibatch: {time.perf_counter() - t0:.2f}s (score={child_score:.4f})"
+ )

  # 8. Acceptance test
- if self._should_accept_child(parent_score, child_score):
+ accepted = self._should_accept_child(parent_score, child_score)
+ print(
+ f"[DEBUG] Acceptance: {'ACCEPTED' if accepted else 'REJECTED'}"
+ )
+ if accepted:
+ t0 = time.perf_counter()
  accepted_iterations.append(
  await self._a_accept_child(
  selected_module_id,
@@ -301,6 +355,13 @@
  child_score,
  )
  )
+ print(
+ f"[DEBUG] Accept child (pareto scoring): {time.perf_counter() - t0:.2f}s"
+ )
+
+ print(
+ f"[DEBUG] Total iteration time: {time.perf_counter() - iter_start:.2f}s\n"
+ )
  return True

  await self._a_run_loop_iteration(_one_iteration)
@@ -308,7 +369,7 @@
  prompt_config_snapshots = build_prompt_config_snapshots(
  self.prompt_configurations_by_id
  )
- report = OptimizationResult(
+ report = OptimizationReport(
  optimization_id=self.optimization_id,
  best_id=best.id,
  accepted_iterations=accepted_iterations,
@@ -316,7 +377,7 @@
  parents=self.parents_by_id,
  prompt_configurations=prompt_config_snapshots,
  )
- return best.prompts[self.SINGLE_MODULE_ID], report.as_dict()
+ return best.prompts[self.SINGLE_MODULE_ID], report

  ###################
  # State & helpers #
@@ -332,23 +393,14 @@
  ] = {}
  self.pareto_score_table: ScoreTable = {}

- def _ensure_scoring_adapter(self) -> None:
- if self.scoring_adapter is None:
+ def _ensure_scorer(self) -> None:
+ if self.scorer is None:
  raise DeepEvalError(
- "GEPARunner requires a `scoring_adapter`. "
- "Construct one (for example, DeepEvalScoringAdapter) in "
- "PromptOptimizer and assign it to `runner.scoring_adapter`."
+ "GEPARunner requires a `scorer`. "
+ "Construct one (for example, Scorer) in "
+ "PromptOptimizer and assign it to `runner.scorer`."
  )

- def _ensure_rewriter(self) -> None:
- if self._rewriter is not None:
- return
-
- # For now, always use the basic PromptRewriter. Additional
- # variants (e.g. for GEPA Alg. 4 crossover) can be introduced
- # later
- self._rewriter = PromptRewriter()
-
  def _prompts_equivalent(
  self, old_prompt: Prompt, new_prompt: Prompt
  ) -> bool:
@@ -413,17 +465,16 @@
  totals,
  self.parents_by_id,
  random_state=self.random_state,
- tie_tolerance=float(self.config.tie_tolerance),
- policy=self.config.tie_breaker,
+ tie_tolerance=GEPA_TIE_TOLERANCE,
+ policy=self.tie_breaker,
  )
  if self.status_callback is not None and len(tied) > 1:
  msg = (
  f"tie on aggregate={max_val:.4f} among {len(tied)} "
  f"prompt_configurations; using tie_breaker="
- f"{self.config.tie_breaker.value!r} selected {chosen}. "
- f"To change, set GEPAConfig.tie_breaker to one of: "
- f"{[t.value for t in self.config.TieBreaker]} "
- f"(tie_tolerance={float(self.config.tie_tolerance):g})."
+ f"{self.tie_breaker.value!r} selected {chosen}. "
+ f"To change, set GEPA tie_breaker to one of: "
+ f"{[t.value for t in self.TieBreaker]}."
  )
  self.status_callback(
  RunnerStatusType.TIE,
@@ -433,9 +484,43 @@
  return self.prompt_configurations_by_id[chosen]

  def _pick_prompt_configuration(self) -> PromptConfiguration:
+ # Log Pareto selection details
+ all_candidates = list(self.pareto_score_table.keys())
+ print(f"[DEBUG] Pareto Selection:")
+ print(f" - Total candidates in pool: {len(all_candidates)}")
+
+ # Show score table
+ print(f" - Score table (per-instance scores):")
+ for cid, scores in self.pareto_score_table.items():
+ is_root = self.parents_by_id.get(cid) is None
+ label = (
+ "(root)"
+ if is_root
+ else f"(child of {self.parents_by_id.get(cid)[:8]}...)"
+ )
+ mean_score = sum(scores) / len(scores) if scores else 0
+ print(
+ f" {cid[:8]}... {label}: {[round(s, 3) for s in scores]} (mean={mean_score:.3f})"
+ )
+
+ # Show Pareto frontier
+ frontier = pareto_frontier(all_candidates, self.pareto_score_table)
+ print(f" - Pareto frontier ({len(frontier)} non-dominated):")
+ for cid in frontier:
+ print(f" {cid[:8]}...")
+
+ # Show frequency weights
+ freq = frequency_weights(self.pareto_score_table)
+ print(f" - Frequency weights (how often each wins an instance):")
+ for cid, weight in freq.items():
+ print(f" {cid[:8]}...: {weight}")
+
+ # Do the selection
  selected_prompt_configuration_id = select_prompt_configuration_pareto(
  self.pareto_score_table, random_state=self.random_state
  )
+ print(f" - Selected: {selected_prompt_configuration_id[:8]}...\n")
+
  return self.prompt_configurations_by_id[
  selected_prompt_configuration_id
  ]
@@ -443,25 +528,13 @@
  def _draw_minibatch(
  self, d_feedback: Union[List["Golden"], List["ConversationalGolden"]]
  ) -> Union[List["Golden"], List["ConversationalGolden"]]:
- # Determine effective minibatch size from GEPAConfig, bounded by the
+ # Determine effective minibatch size, bounded by the
  # available feedback set.
  n_feedback = len(d_feedback)
  if n_feedback <= 0:
  return []

- if self.config.minibatch_size is not None:
- size = self.config.minibatch_size
- else:
- # Dynamic sizing from ratio, bounded between min and max.
- dynamic = max(
- 1, int(round(n_feedback * self.config.minibatch_ratio))
- )
- size = max(
- self.config.minibatch_min_size,
- min(dynamic, self.config.minibatch_max_size),
- )
-
- size = max(1, min(size, n_feedback))
+ size = min(self.minibatch_size, n_feedback)

  return [
  d_feedback[self.random_state.randrange(0, n_feedback)]
@@ -479,7 +552,6 @@
  )

  new_prompt = await self._rewriter.a_rewrite(
- model_callback=self.model_callback,
  module_id=selected_module_id,
  old_prompt=old_prompt,
  feedback_text=feedback_text,
@@ -504,7 +576,6 @@
  )

  new_prompt = self._rewriter.rewrite(
- model_callback=self.model_callback,
  module_id=selected_module_id,
  old_prompt=old_prompt,
  feedback_text=feedback_text,
@@ -535,7 +606,7 @@
  self, parent_score: float, child_score: float
  ) -> bool:
  jitter = 1e-6
- return child_score >= parent_score + max(self.config.min_delta, jitter)
+ return child_score >= parent_score + max(GEPA_MIN_DELTA, jitter)

  def _accept_child(
  self,
@@ -548,9 +619,7 @@
  ) -> AcceptedIterationDict:
  self._add_prompt_configuration(child_prompt_configuration)
  self.pareto_score_table[child_prompt_configuration.id] = (
- self.scoring_adapter.score_on_pareto(
- child_prompt_configuration, d_pareto
- )
+ self.scorer.score_pareto(child_prompt_configuration, d_pareto)
  )

  return AcceptedIterationDict(
@@ -572,7 +641,7 @@
  ) -> AcceptedIterationDict:
  self._add_prompt_configuration(child_prompt_configuration)
  self.pareto_score_table[child_prompt_configuration.id] = (
- await self.scoring_adapter.a_score_on_pareto(
+ await self.scorer.a_score_pareto(
  child_prompt_configuration, d_pareto
  )
  )
@@ -590,13 +659,12 @@
  total_iterations: int,
  iteration: int,
  remaining_iterations: int,
- elapsed: float,
  ):
  if self.status_callback is not None:
  detail = (
  f"(iterations={total_iterations}) "
  f"• iteration {iteration}/{total_iterations} "
- f"• {elapsed:.2f}s • remaining={remaining_iterations}"
+ f"• remaining={remaining_iterations}"
  )
  self.status_callback(
  RunnerStatusType.PROGRESS,
@@ -626,52 +694,44 @@
  self,
  gepa_iteration: Callable[[], bool],
  ) -> None:
- total_iterations = self.config.iterations
+ total_iterations = self.iterations
  remaining_iterations = total_iterations
  iteration = 0
- self._update_progress(
- total_iterations, iteration, remaining_iterations, 0
- )
+ self._update_progress(total_iterations, iteration, remaining_iterations)
  while remaining_iterations > 0:
  iteration += 1
- start_time = time.perf_counter()
  try:
  ok = gepa_iteration()
  except Exception as exc:
  # Report a user facing error event and halt optimization.
  self._update_error(total_iterations, iteration, exc)
  break
- elapsed = time.perf_counter() - start_time
  if not ok:
  break
  remaining_iterations -= 1
  self._update_progress(
- total_iterations, iteration, remaining_iterations, elapsed
+ total_iterations, iteration, remaining_iterations
  )

  async def _a_run_loop_iteration(
  self,
  a_gepa_iteration: Callable[[], Awaitable[bool]],
  ) -> None:
- total_iterations = self.config.iterations
+ total_iterations = self.iterations
  remaining_iterations = total_iterations
  iteration = 0
- self._update_progress(
- total_iterations, iteration, remaining_iterations, 0
- )
+ self._update_progress(total_iterations, iteration, remaining_iterations)
  while remaining_iterations > 0:
  iteration += 1
- start_time = time.perf_counter()
  try:
  ok = await a_gepa_iteration()
  except Exception as exc:
  # Report a user facing error event and halt optimization.
  self._update_error(total_iterations, iteration, exc)
  break
- elapsed = time.perf_counter() - start_time
  if not ok:
  break
  remaining_iterations -= 1
  self._update_progress(
- total_iterations, iteration, remaining_iterations, elapsed
+ total_iterations, iteration, remaining_iterations
  )
deepeval/optimizer/algorithms/miprov2/__init__.py (new file)
@@ -0,0 +1,17 @@
+ from .miprov2 import MIPROV2
+ from .proposer import InstructionProposer
+ from .bootstrapper import (
+ Demo,
+ DemoSet,
+ DemoBootstrapper,
+ render_prompt_with_demos,
+ )
+
+ __all__ = [
+ "MIPROV2",
+ "InstructionProposer",
+ "Demo",
+ "DemoSet",
+ "DemoBootstrapper",
+ "render_prompt_with_demos",
+ ]
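
The new package __init__ above re-exports the MIPRO v2 public names, so imports along these lines should work (an illustrative sketch, not taken from the diff itself):

    from deepeval.optimizer.algorithms.miprov2 import (
        MIPROV2,
        DemoBootstrapper,
        render_prompt_with_demos,
    )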