deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. deepeval/_version.py +1 -1
  2. deepeval/dataset/golden.py +54 -2
  3. deepeval/evaluate/evaluate.py +16 -8
  4. deepeval/evaluate/execute.py +70 -26
  5. deepeval/evaluate/utils.py +26 -22
  6. deepeval/integrations/pydantic_ai/agent.py +19 -2
  7. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  8. deepeval/metrics/__init__.py +14 -12
  9. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  10. deepeval/metrics/answer_relevancy/template.py +188 -92
  11. deepeval/metrics/base_metric.py +2 -5
  12. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  13. deepeval/metrics/contextual_precision/template.py +115 -66
  14. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  15. deepeval/metrics/contextual_recall/template.py +106 -55
  16. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  17. deepeval/metrics/contextual_relevancy/template.py +87 -58
  18. deepeval/metrics/dag/templates.py +2 -2
  19. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  20. deepeval/metrics/faithfulness/schema.py +1 -1
  21. deepeval/metrics/faithfulness/template.py +200 -115
  22. deepeval/metrics/g_eval/utils.py +2 -2
  23. deepeval/metrics/indicator.py +4 -4
  24. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  25. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  26. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  27. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  28. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  29. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  31. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  32. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  33. deepeval/metrics/ragas.py +3 -3
  34. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  35. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  36. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  37. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  38. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  39. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  40. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  41. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  42. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  43. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  44. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  45. deepeval/metrics/turn_faithfulness/template.py +218 -0
  46. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  47. deepeval/metrics/utils.py +39 -58
  48. deepeval/models/__init__.py +0 -12
  49. deepeval/models/base_model.py +16 -38
  50. deepeval/models/embedding_models/__init__.py +7 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +52 -28
  52. deepeval/models/embedding_models/local_embedding_model.py +18 -14
  53. deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
  54. deepeval/models/embedding_models/openai_embedding_model.py +40 -21
  55. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  56. deepeval/models/llms/anthropic_model.py +44 -23
  57. deepeval/models/llms/azure_model.py +121 -36
  58. deepeval/models/llms/deepseek_model.py +18 -13
  59. deepeval/models/llms/gemini_model.py +129 -43
  60. deepeval/models/llms/grok_model.py +18 -13
  61. deepeval/models/llms/kimi_model.py +18 -13
  62. deepeval/models/llms/litellm_model.py +42 -22
  63. deepeval/models/llms/local_model.py +12 -7
  64. deepeval/models/llms/ollama_model.py +114 -12
  65. deepeval/models/llms/openai_model.py +137 -41
  66. deepeval/models/llms/portkey_model.py +24 -7
  67. deepeval/models/llms/utils.py +5 -3
  68. deepeval/models/retry_policy.py +17 -14
  69. deepeval/models/utils.py +46 -1
  70. deepeval/optimizer/__init__.py +5 -0
  71. deepeval/optimizer/algorithms/__init__.py +6 -0
  72. deepeval/optimizer/algorithms/base.py +29 -0
  73. deepeval/optimizer/algorithms/configs.py +18 -0
  74. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  75. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  76. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  77. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  78. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  79. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  80. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  81. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  82. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  83. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  84. deepeval/{optimization → optimizer}/configs.py +5 -8
  85. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  86. deepeval/optimizer/prompt_optimizer.py +263 -0
  87. deepeval/optimizer/rewriter/__init__.py +5 -0
  88. deepeval/optimizer/rewriter/rewriter.py +124 -0
  89. deepeval/optimizer/rewriter/utils.py +214 -0
  90. deepeval/optimizer/scorer/__init__.py +5 -0
  91. deepeval/optimizer/scorer/base.py +86 -0
  92. deepeval/optimizer/scorer/scorer.py +316 -0
  93. deepeval/optimizer/scorer/utils.py +30 -0
  94. deepeval/optimizer/types.py +148 -0
  95. deepeval/{optimization → optimizer}/utils.py +47 -165
  96. deepeval/prompt/prompt.py +5 -9
  97. deepeval/test_case/__init__.py +1 -3
  98. deepeval/test_case/api.py +12 -10
  99. deepeval/test_case/conversational_test_case.py +19 -1
  100. deepeval/test_case/llm_test_case.py +152 -1
  101. deepeval/test_case/utils.py +4 -8
  102. deepeval/test_run/api.py +15 -14
  103. deepeval/test_run/test_run.py +3 -3
  104. deepeval/tracing/patchers.py +9 -4
  105. deepeval/tracing/tracing.py +2 -2
  106. deepeval/utils.py +65 -0
  107. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  108. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
  109. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  110. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  111. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  112. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  113. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  114. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  115. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  116. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  117. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  118. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  119. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  120. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  121. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  122. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  123. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  124. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  125. deepeval/models/mlllms/__init__.py +0 -4
  126. deepeval/models/mlllms/azure_model.py +0 -343
  127. deepeval/models/mlllms/gemini_model.py +0 -313
  128. deepeval/models/mlllms/ollama_model.py +0 -175
  129. deepeval/models/mlllms/openai_model.py +0 -309
  130. deepeval/optimization/__init__.py +0 -13
  131. deepeval/optimization/adapters/__init__.py +0 -2
  132. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  133. deepeval/optimization/aggregates.py +0 -14
  134. deepeval/optimization/copro/configs.py +0 -31
  135. deepeval/optimization/gepa/__init__.py +0 -7
  136. deepeval/optimization/gepa/configs.py +0 -115
  137. deepeval/optimization/miprov2/configs.py +0 -134
  138. deepeval/optimization/miprov2/loop.py +0 -785
  139. deepeval/optimization/mutations/__init__.py +0 -0
  140. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  141. deepeval/optimization/policies/__init__.py +0 -16
  142. deepeval/optimization/policies/tie_breaker.py +0 -67
  143. deepeval/optimization/prompt_optimizer.py +0 -462
  144. deepeval/optimization/simba/__init__.py +0 -0
  145. deepeval/optimization/simba/configs.py +0 -33
  146. deepeval/optimization/types.py +0 -361
  147. deepeval/test_case/mllm_test_case.py +0 -170
  148. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  149. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  150. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  152. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  153. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  154. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  155. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py}

@@ -30,59 +30,109 @@ from typing import (
     Union,
 )
 
+from deepeval.models.base_model import DeepEvalBaseLLM
+
 from deepeval.errors import DeepEvalError
 from deepeval.dataset.golden import ConversationalGolden, Golden
-from deepeval.optimization.aggregates import Aggregator, mean_of_all
-from deepeval.optimization.types import (
+from deepeval.optimizer.utils import Aggregator, mean_of_all
+from deepeval.optimizer.types import (
     AcceptedIterationDict,
     ModuleId,
-    OptimizationResult,
+    OptimizationReport,
     PromptConfiguration,
     PromptConfigurationId,
-    RunnerStatusCallbackProtocol,
+    RunnerStatusCallback,
     RunnerStatusType,
     ScoreTable,
-    ScoringAdapter,
 )
-from deepeval.optimization.utils import build_prompt_config_snapshots
+from deepeval.optimizer.scorer.base import BaseScorer
+from deepeval.optimizer.algorithms.base import BaseAlgorithm
+from deepeval.optimizer.utils import build_prompt_config_snapshots
 from deepeval.prompt.api import PromptType
 from deepeval.prompt.prompt import Prompt
-from deepeval.optimization.mutations.prompt_rewriter import PromptRewriter
+from deepeval.optimizer.rewriter import Rewriter
 
-from .configs import SIMBAConfig
-from .types import SIMBAStrategy
+from deepeval.optimizer.algorithms.configs import (
+    MIPROV2_MIN_DELTA,
+    MIPROV2_REWRITE_INSTRUCTION_MAX_CHARS,
+    SIMBA_DEMO_INPUT_MAX_CHARS,
+)
+from deepeval.optimizer.algorithms.simba.types import SIMBAStrategy
 
 
-class SIMBARunner:
+class SIMBA(BaseAlgorithm):
     """
     SIMBA-style cooperative prompt optimization loop with sync/async execution.
 
     This runner is intentionally low level and does not know about metrics,
-    models, or async configs. It relies on a preconfigured ScoringAdapter and
-    PromptRewriter, which are typically constructed by PromptOptimizer.
-
-    - Optimizes a single Prompt (instruction) against a list of Goldens.
-    - Uses mini-batches of goldens for trial scoring and epsilon-greedy
-      selection over prompt candidates based on mean minibatch scores.
-    - At each iteration, proposes multiple child prompts using SIMBA-style
-      edit strategies (APPEND_DEMO and APPEND_RULE) by passing different
-      instructions into the PromptRewriter.
+    models, or async configs. It relies on a preconfigured Scorer and
+    Rewriter, which are typically constructed by PromptOptimizer.
+
+    Parameters
+    ----------
+    iterations : int
+        Total number of optimization trials. Default is 5.
+    minibatch_size : int
+        Number of examples drawn per iteration. Default is 8.
+    random_seed : int, optional
+        RNG seed for reproducibility. If None, derived from time.time_ns().
+    exploration_probability : float
+        Epsilon greedy exploration rate. Default is 0.2.
+    full_eval_every : int, optional
+        Fully evaluate best candidate every N trials. Default is 5.
+    population_size : int
+        Maximum number of candidates in the pool. Default is 4.
+    proposals_per_step : int
+        Number of child prompts proposed per iteration. Default is 4.
+    max_demos_per_proposal : int
+        Maximum demos from minibatch for APPEND_DEMO strategy. Default is 3.
     """
 
+    name = "SIMBA"
     SINGLE_MODULE_ID: ModuleId = "__module__"
 
     def __init__(
         self,
-        *,
-        config: SIMBAConfig,
+        iterations: int = 5,
+        minibatch_size: int = 8,
+        random_seed: Optional[int] = None,
+        exploration_probability: float = 0.2,
+        full_eval_every: Optional[int] = 5,
+        population_size: int = 4,
+        proposals_per_step: int = 4,
+        max_demos_per_proposal: int = 3,
         aggregate_instances: Aggregator = mean_of_all,
-        scoring_adapter: Optional[ScoringAdapter] = None,
+        scorer: Optional[BaseScorer] = None,
     ) -> None:
-        self.config = config
+        # Validate parameters
+        if iterations < 1:
+            raise ValueError("iterations must be >= 1")
+        if minibatch_size < 1:
+            raise ValueError("minibatch_size must be >= 1")
+        if exploration_probability < 0.0 or exploration_probability > 1.0:
+            raise ValueError(
+                "exploration_probability must be >= 0.0 and <= 1.0"
+            )
+        if full_eval_every is not None and full_eval_every < 1:
+            raise ValueError("full_eval_every must be >= 1")
+        if population_size < 1:
+            raise ValueError("population_size must be >= 1")
+        if proposals_per_step < 1:
+            raise ValueError("proposals_per_step must be >= 1")
+        if max_demos_per_proposal < 0:
+            raise ValueError("max_demos_per_proposal must be >= 0")
+
+        self.iterations = iterations
+        self.minibatch_size = minibatch_size
+        self.exploration_probability = exploration_probability
+        self.full_eval_every = full_eval_every
+        self.population_size = population_size
+        self.proposals_per_step = proposals_per_step
+        self.max_demos_per_proposal = max_demos_per_proposal
         self.aggregate_instances = aggregate_instances
-        self.scoring_adapter = scoring_adapter
+        self.scorer = scorer
 
-        if config.max_demos_per_proposal > 0:
+        if max_demos_per_proposal > 0:
             self._strategies = [
                 SIMBAStrategy.APPEND_DEMO,
                 SIMBAStrategy.APPEND_RULE,
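The SIMBAConfig object is gone: SIMBA is now constructed directly from keyword arguments and validates them eagerly in __init__. A minimal construction sketch based only on the signature above; the deepeval.optimizer.algorithms import path is an assumption inferred from the new package layout:

    from deepeval.optimizer.algorithms import SIMBA  # export path assumed

    # Defaults mirror the class docstring; out-of-range values raise ValueError.
    algorithm = SIMBA(
        iterations=10,                # total optimization trials, must be >= 1
        minibatch_size=8,             # goldens sampled per trial, must be >= 1
        random_seed=42,               # omit to derive a seed from time.time_ns()
        exploration_probability=0.2,  # epsilon for epsilon-greedy selection
    )
    # SIMBA(iterations=0) would raise ValueError("iterations must be >= 1")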
@@ -90,31 +140,25 @@ class SIMBARunner:
         else:
             self._strategies = [SIMBAStrategy.APPEND_RULE]
 
-        # Random seeded from config is used for minibatch sampling, strategy
-        # selection, and epsilon-greedy candidate selection.
-        self.random_state = random.Random(config.random_seed)
+        # If no seed provided, use time-based seed
+        if random_seed is None:
+            random_seed = time.time_ns()
+        self.random_seed = random_seed
+        self.random_state = random.Random(random_seed)
 
         # Runtime state to be reset between runs
         self.reset_state()
 
         # Status callback set by PromptOptimizer:
         # (kind, step_index, total_steps, detail) -> None
-        self.status_callback: Optional[RunnerStatusCallbackProtocol] = None
-
-        # Model callback used by the rewriter set by PromptOptimizer.
-        self.model_callback: Optional[
-            Callable[
-                ...,
-                Union[
-                    str,
-                    Dict,
-                    Tuple[Union[str, Dict], float],
-                ],
-            ]
-        ] = None
+        self.status_callback: Optional[RunnerStatusCallback] = None
 
-        # Lazy-loaded PromptRewriter set by PromptOptimizer
-        self._rewriter: Optional[PromptRewriter] = None
+        # Optimizer model used by the rewriter for prompt mutation.
+        # Set by PromptOptimizer.
+        self.optimizer_model: Optional["DeepEvalBaseLLM"] = None
+
+        # Lazy-loaded Rewriter set by PromptOptimizer
+        self._rewriter: Optional[Rewriter] = None
 
     ##############
     # Public API #
@@ -122,10 +166,9 @@
 
     def execute(
         self,
-        *,
         prompt: Prompt,
         goldens: Union[List[Golden], List[ConversationalGolden]],
-    ) -> Tuple[Prompt, Dict]:
+    ) -> Tuple[Prompt, OptimizationReport]:
         """
         Synchronous SIMBA run from a full list of goldens.
 
@@ -140,8 +183,7 @@
                 "the optimizer."
             )
 
-        self._ensure_scoring_adapter()
-        self._ensure_rewriter()
+        self._ensure_scorer()
        self.reset_state()
 
         # Seed candidate pool with the root prompt configuration.
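Note the return type: execute() now hands back a structured OptimizationReport instead of a plain dict, and the keyword-only marker is dropped from the signature. A sketch of the new call shape, assuming a scorer was already attached (normally by PromptOptimizer; _ensure_scorer() raises DeepEvalError otherwise) and assuming OptimizationReport exposes its constructor fields as attributes:

    # Hypothetical usage; `prompt` and `goldens` as in deepeval's Prompt/Golden APIs.
    best_prompt, report = algorithm.execute(prompt=prompt, goldens=goldens)
    print(report.best_id)  # other fields per the diff: optimization_id,
                           # accepted_iterations, parents, prompt_configurations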
@@ -167,7 +209,7 @@
             # candidate on the first iteration.
             if not self._minibatch_score_counts:
                 seed_minibatch = self._draw_minibatch(goldens)
-                root_score = self.scoring_adapter.minibatch_score(
+                root_score = self.scorer.score_minibatch(
                     root_prompt_configuration, seed_minibatch
                 )
                 self._record_minibatch_score(
@@ -182,7 +224,7 @@
 
             # Compute shared feedback for this parent/minibatch that will be
             # used by all SIMBA proposals in this iteration.
-            feedback_text = self.scoring_adapter.minibatch_feedback(
+            feedback_text = self.scorer.get_minibatch_feedback(
                 parent_prompt_configuration, selected_module_id, minibatch
             )
 
@@ -190,10 +232,10 @@
                 parent_prompt_configuration.id
             )
             jitter = 1e-6
-            min_delta = max(self.config.min_delta, jitter)
+            min_delta = max(MIPROV2_MIN_DELTA, jitter)
 
             # 2. Generate multiple SIMBA child prompts and evaluate them.
-            num_proposals = int(self.config.proposals_per_step)
+            num_proposals = int(self.proposals_per_step)
             for _ in range(num_proposals):
                 strategy = self._sample_strategy()
                 child_prompt = self._generate_child_prompt(
@@ -213,7 +255,7 @@
                     child_prompt,
                 )
 
-                child_score = self.scoring_adapter.minibatch_score(
+                child_score = self.scorer.score_minibatch(
                     child_prompt_configuration, minibatch
                 )
 
@@ -238,8 +280,8 @@
 
             self.trial_index += 1
             if (
-                self.config.full_eval_every is not None
-                and self.trial_index % self.config.full_eval_every == 0
+                self.full_eval_every is not None
+                and self.trial_index % self.full_eval_every == 0
             ):
                 self._full_evaluate_best(goldens)
 
@@ -255,7 +297,7 @@
         prompt_config_snapshots = build_prompt_config_snapshots(
             self.prompt_configurations_by_id
         )
-        report = OptimizationResult(
+        report = OptimizationReport(
             optimization_id=self.optimization_id,
             best_id=best.id,
             accepted_iterations=accepted_iterations,
@@ -263,14 +305,13 @@
             parents=self.parents_by_id,
             prompt_configurations=prompt_config_snapshots,
         )
-        return best.prompts[self.SINGLE_MODULE_ID], report.as_dict()
+        return best.prompts[self.SINGLE_MODULE_ID], report
 
     async def a_execute(
         self,
-        *,
         prompt: Prompt,
         goldens: Union[List[Golden], List[ConversationalGolden]],
-    ) -> Tuple[Prompt, Dict]:
+    ) -> Tuple[Prompt, OptimizationReport]:
         """
         Asynchronous twin of execute().
         """
@@ -282,8 +323,7 @@
                 "the optimizer."
             )
 
-        self._ensure_scoring_adapter()
-        self._ensure_rewriter()
+        self._ensure_scorer()
         self.reset_state()
 
         seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
@@ -303,7 +343,7 @@
 
         if not self._minibatch_score_counts:
             seed_minibatch = self._draw_minibatch(goldens)
-            root_score = await self.scoring_adapter.a_minibatch_score(
+            root_score = await self.scorer.a_score_minibatch(
                 root_prompt_configuration, seed_minibatch
             )
             self._record_minibatch_score(
@@ -315,7 +355,7 @@
 
             minibatch = self._draw_minibatch(goldens)
 
-            feedback_text = await self.scoring_adapter.a_minibatch_feedback(
+            feedback_text = await self.scorer.a_get_minibatch_feedback(
                 parent_prompt_configuration, selected_module_id, minibatch
             )
 
@@ -323,9 +363,9 @@
                 parent_prompt_configuration.id
             )
             jitter = 1e-6
-            min_delta = max(self.config.min_delta, jitter)
+            min_delta = max(MIPROV2_MIN_DELTA, jitter)
 
-            num_proposals = int(self.config.proposals_per_step)
+            num_proposals = int(self.proposals_per_step)
             for _ in range(num_proposals):
                 strategy = self._sample_strategy()
                 child_prompt = await self._a_generate_child_prompt(
@@ -344,7 +384,7 @@
                 child_prompt,
             )
 
-            child_score = await self.scoring_adapter.a_minibatch_score(
+            child_score = await self.scorer.a_score_minibatch(
                 child_prompt_configuration, minibatch
             )
 
@@ -366,8 +406,8 @@
 
             self.trial_index += 1
             if (
-                self.config.full_eval_every is not None
-                and self.trial_index % self.config.full_eval_every == 0
+                self.full_eval_every is not None
+                and self.trial_index % self.full_eval_every == 0
             ):
                 await self._a_full_evaluate_best(goldens)
 
@@ -382,7 +422,7 @@
         prompt_config_snapshots = build_prompt_config_snapshots(
             self.prompt_configurations_by_id
         )
-        report = OptimizationResult(
+        report = OptimizationReport(
             optimization_id=self.optimization_id,
             best_id=best.id,
             accepted_iterations=accepted_iterations,
@@ -390,7 +430,7 @@
             parents=self.parents_by_id,
             prompt_configurations=prompt_config_snapshots,
         )
-        return best.prompts[self.SINGLE_MODULE_ID], report.as_dict()
+        return best.prompts[self.SINGLE_MODULE_ID], report
 
     ###################
     # State & helpers #
@@ -414,25 +454,14 @@
         # Trial counter (used for full_eval_every).
         self.trial_index: int = 0
 
-    def _ensure_scoring_adapter(self) -> None:
-        if self.scoring_adapter is None:
+    def _ensure_scorer(self) -> None:
+        if self.scorer is None:
             raise DeepEvalError(
-                "SIMBARunner requires a `scoring_adapter`. "
-                "Construct one (for example, DeepEvalScoringAdapter) in "
-                "PromptOptimizer and assign it to `runner.scoring_adapter`."
+                "SIMBARunner requires a `scorer`. "
+                "Construct one (for example, Scorer) in "
+                "PromptOptimizer and assign it to `runner.scorer`."
             )
 
-    def _ensure_rewriter(self) -> None:
-        if self._rewriter is not None:
-            return
-
-        # Default basic PromptRewriter; PromptOptimizer can override this and
-        # pass a configured instance (e.g. with list-mutation config).
-        self._rewriter = PromptRewriter(
-            max_chars=self.config.rewrite_instruction_max_chars,
-            random_state=self.random_state,
-        )
-
     def _prompts_equivalent(
         self,
         old_prompt: Prompt,
@@ -484,9 +513,7 @@
 
         # If we exceed the population size, iteratively prune the worst
         # (by mean minibatch score), never removing the current best.
-        while (
-            len(self.prompt_configurations_by_id) > self.config.population_size
-        ):
+        while len(self.prompt_configurations_by_id) > self.population_size:
             best_id: Optional[PromptConfigurationId] = None
             best_score = float("-inf")
             for cand_id in self.prompt_configurations_by_id.keys():
@@ -611,7 +638,7 @@
             "SIMBARunner has an empty candidate pool; this should not happen."
         )
 
-        eps = float(self.config.exploration_probability)
+        eps = float(self.exploration_probability)
         if eps > 0.0 and self.random_state.random() < eps:
             chosen_id = self.random_state.choice(candidate_ids)
         else:
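The candidate selection above is plain epsilon-greedy: with probability eps pick a uniformly random candidate, otherwise exploit the best mean minibatch score. A self-contained illustration of the same policy (illustrative helper, not deepeval API):

    import random

    def epsilon_greedy_pick(candidate_ids, mean_scores, eps, rng):
        # Explore with probability eps: uniform random candidate.
        if eps > 0.0 and rng.random() < eps:
            return rng.choice(candidate_ids)
        # Exploit otherwise: highest mean minibatch score wins.
        return max(candidate_ids, key=lambda cid: mean_scores[cid])

    rng = random.Random(42)
    scores = {"root": 0.6, "child": 0.8}
    print(epsilon_greedy_pick(["root", "child"], scores, 0.2, rng))  # usually "child"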
@@ -624,23 +651,14 @@ class SIMBARunner:
         goldens: Union[List[Golden], List[ConversationalGolden]],
     ) -> Union[List[Golden], List[ConversationalGolden]]:
         """
-        Determine effective minibatch size from SIMBAConfig, bounded by the
-        available goldens, and sample with replacement.
+        Determine effective minibatch size, bounded by the available goldens,
+        and sample with replacement.
         """
         n = len(goldens)
         if n <= 0:
             return []
 
-        if self.config.minibatch_size is not None:
-            size = self.config.minibatch_size
-        else:
-            dynamic = max(1, int(round(n * self.config.minibatch_ratio)))
-            size = max(
-                self.config.minibatch_min_size,
-                min(dynamic, self.config.minibatch_max_size),
-            )
-
-        size = max(1, min(size, n))
+        size = min(self.minibatch_size, n)
 
         return [goldens[self.random_state.randrange(0, n)] for _ in range(size)]
 
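The old four-knob sizing logic (minibatch_ratio plus min/max bounds) collapses to min(self.minibatch_size, n), still sampling with replacement. A standalone illustration of the new behavior (illustrative helper, not deepeval API):

    import random

    def draw_minibatch(goldens, minibatch_size, rng):
        n = len(goldens)
        if n <= 0:
            return []
        size = min(minibatch_size, n)  # bounded by the available goldens
        # Sampling with replacement: the same golden may be drawn twice.
        return [goldens[rng.randrange(0, n)] for _ in range(size)]

    print(draw_minibatch(["g1", "g2", "g3"], 8, random.Random(0)))  # 3 draws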
@@ -655,7 +673,7 @@
         if best.id in self.pareto_score_table:
             return
 
-        scores = await self.scoring_adapter.a_score_on_pareto(best, goldens)
+        scores = await self.scorer.a_score_pareto(best, goldens)
         self.pareto_score_table[best.id] = scores
 
     def _full_evaluate_best(
@@ -669,7 +687,7 @@
         if best.id in self.pareto_score_table:
             return
 
-        scores = self.scoring_adapter.score_on_pareto(best, goldens)
+        scores = self.scorer.score_pareto(best, goldens)
         self.pareto_score_table[best.id] = scores
 
     async def _a_generate_child_prompt(
@@ -694,7 +712,6 @@
         )
 
         new_prompt = await self._rewriter.a_rewrite(
-            model_callback=self.model_callback,
             module_id=selected_module_id,
             old_prompt=old_prompt,
             feedback_text=strategy_feedback,
@@ -730,7 +747,6 @@
         )
 
         new_prompt = self._rewriter.rewrite(
-            model_callback=self.model_callback,
             module_id=selected_module_id,
             old_prompt=old_prompt,
             feedback_text=strategy_feedback,
@@ -761,7 +777,7 @@
         Truncate strategy instructions + feedback to the configured character
         budget so the rewriter prompt does not explode.
         """
-        max_chars = self.config.rewrite_instruction_max_chars
+        max_chars = MIPROV2_REWRITE_INSTRUCTION_MAX_CHARS
         if max_chars <= 0:
             return text
         if len(text) <= max_chars:
@@ -788,15 +804,15 @@
             Context <- " ".join(golden.context) if present
             Output <- golden.expected_outcome
 
-        All text segments are independently truncated to `demo_input_max_chars`.
+        All text segments are independently truncated to `SIMBA_DEMO_INPUT_MAX_CHARS`.
         """
-        max_demos = self.config.max_demos_per_proposal
+        max_demos = self.max_demos_per_proposal
         if max_demos <= 0:
             return ""
 
         lines: List[str] = []
         demo_limit = min(max_demos, len(minibatch))
-        max_chars = self.config.demo_input_max_chars
+        max_chars = SIMBA_DEMO_INPUT_MAX_CHARS
 
         for golden in minibatch[:demo_limit]:
             if isinstance(golden, Golden):
@@ -843,7 +859,7 @@
     ) -> str:
         """
         Construct a strategy-specific feedback string that is passed into
-        PromptRewriter.rewrite / a_rewrite.
+        Rewriter.rewrite / a_rewrite.
 
         - APPEND_RULE: emphasize extracting a concise rule from metric feedback.
         - APPEND_DEMO: emphasize appending concrete demos built from goldens.
@@ -934,7 +950,7 @@
         self,
         simba_iteration: Callable[[], bool],
     ) -> None:
-        total_iterations = self.config.iterations
+        total_iterations = self.iterations
         remaining_iterations = total_iterations
         iteration = 0
         self._update_progress(
@@ -960,7 +976,7 @@
         self,
         a_simba_iteration: Callable[[], Awaitable[bool]],
     ) -> None:
-        total_iterations = self.config.iterations
+        total_iterations = self.iterations
         remaining_iterations = total_iterations
         iteration = 0
         self._update_progress(
deepeval/{optimization → optimizer}/configs.py

@@ -2,27 +2,24 @@ from __future__ import annotations
 from enum import Enum
 from pydantic import BaseModel, Field, conint
 from typing import Optional
+from deepeval.evaluate.configs import AsyncConfig
 
 
-class OptimizerDisplayConfig(BaseModel):
-    """Display controls used by PromptOptimizer for all algorithms."""
-
+class DisplayConfig(BaseModel):
     show_indicator: bool = True
     announce_ties: bool = Field(
         False, description="Print a one-line note when a tie is detected"
     )
 
 
-class PromptListMutationTargetType(Enum):
+class MutationTargetType(Enum):
     RANDOM = "random"
     FIXED_INDEX = "fixed_index"
 
 
 # default all messages
-class PromptListMutationConfig(BaseModel):
-    target_type: PromptListMutationTargetType = (
-        PromptListMutationTargetType.RANDOM
-    )
+class MutationConfig(BaseModel):
+    target_type: MutationTargetType = MutationTargetType.RANDOM
     # should be list
     target_role: Optional[str] = Field(
         default=None,
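For reference, the renamed models keep their fields. A construction sketch, assuming the classes remain exported from the renamed deepeval.optimizer.configs module:

    from deepeval.optimizer.configs import (  # module path per the rename above
        DisplayConfig,
        MutationConfig,
        MutationTargetType,
    )

    display = DisplayConfig(show_indicator=True, announce_ties=True)
    # Defaults to mutating a random message; FIXED_INDEX targets a specific one.
    mutation = MutationConfig(target_type=MutationTargetType.RANDOM)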
deepeval/{optimization/policies/selection.py → optimizer/policies.py}

@@ -1,9 +1,10 @@
 from __future__ import annotations
-from typing import Dict, List, Sequence
+from enum import Enum
 import random
+from typing import Dict, List, Sequence, Optional, Tuple
 
 from deepeval.errors import DeepEvalError
-from deepeval.optimization.types import PromptConfigurationId, ScoreTable
+from deepeval.optimizer.types import PromptConfigurationId, ScoreTable
 
 
 def _is_dominated(
@@ -164,3 +165,63 @@ def select_prompt_configuration_pareto(
     """
     freq = frequency_weights(score_table)
     return sample_by_frequency(freq, random_state=random_state)
+
+
+class TieBreaker(str, Enum):
+    PREFER_ROOT = "prefer_root"
+    PREFER_CHILD = "prefer_child"
+    RANDOM = "random"
+
+
+def pick_best_with_ties(
+    totals: Dict[PromptConfigurationId, float],
+    parents_by_id: Dict[PromptConfigurationId, Optional[PromptConfigurationId]],
+    *,
+    random_state: random.Random,
+    tie_tolerance: float = 1e-9,
+    policy: TieBreaker = TieBreaker.PREFER_ROOT,
+) -> Tuple[PromptConfigurationId, List[PromptConfigurationId], float]:
+    """
+    Choose the best candidate by aggregate score with deterministic tie handling.
+
+    Returns: (chosen_id, tied_ids, max_score)
+    - tied_ids includes everyone within tie_tolerance of max_score
+    """
+    if not totals:
+        raise DeepEvalError("No candidate prompt configuration to choose from.")
+
+    max_score = max(totals.values())
+    tied = [
+        prompt_configuration_id
+        for prompt_configuration_id, score in totals.items()
+        if abs(score - max_score) <= tie_tolerance
+    ]
+
+    if len(tied) == 1:
+        return tied[0], tied, max_score
+
+    # Resolve tie by policy
+    if policy == TieBreaker.PREFER_CHILD:
+        # Prefer any non root. When multiple children exist, use the most recent
+        child_ids = [
+            prompt_configuration_id
+            for prompt_configuration_id in tied
+            if parents_by_id.get(prompt_configuration_id) is not None
+        ]
+        if child_ids:
+            # choose the newest child deterministically by order
+            for prompt_configuration_id in reversed(list(totals.keys())):
+                if prompt_configuration_id in child_ids:
+                    return prompt_configuration_id, tied, max_score
+
+    if policy == TieBreaker.RANDOM:
+        return random_state.choice(tied), tied, max_score
+
+    # by default prefer a root if present, otherwise the first tied
+    root_ids = [
+        prompt_configuration_id
+        for prompt_configuration_id in tied
+        if parents_by_id.get(prompt_configuration_id) is None
+    ]
+    chosen = root_ids[0] if root_ids else tied[0]
+    return chosen, tied, max_score
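A usage sketch of the new tie-aware selection helper, using the names defined in the hunk above; the deepeval.optimizer.policies import path follows the rename, and plain strings stand in for PromptConfigurationId:

    import random
    from deepeval.optimizer.policies import TieBreaker, pick_best_with_ties

    totals = {"root": 0.90, "child-1": 0.90, "child-2": 0.75}
    parents_by_id = {"root": None, "child-1": "root", "child-2": "root"}

    chosen, tied, max_score = pick_best_with_ties(
        totals,
        parents_by_id,
        random_state=random.Random(0),
        policy=TieBreaker.PREFER_CHILD,
    )
    # "root" and "child-1" tie at 0.90; PREFER_CHILD resolves to "child-1".
    print(chosen, tied, max_score)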