deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/optimizer/algorithms/simba/types.py (new file)
@@ -0,0 +1,15 @@
+ from enum import Enum
+
+
+ class SIMBAStrategy(str, Enum):
+     """
+     Edit strategies used by SIMBA-style optimization.
+
+     - APPEND_DEMO: append one or more input/output demos distilled from the
+       current minibatch, similar in spirit to DSPy's `append_a_demo`.
+     - APPEND_RULE: append a concise natural-language rule distilled from
+       feedback, similar in spirit to DSPy's `append_a_rule`.
+     """
+
+     APPEND_DEMO = "append_demo"
+     APPEND_RULE = "append_rule"
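
Note: the two SIMBA edit strategies are plain string-valued enum members. Below is a minimal sketch of how an optimization loop might pick one per minibatch; the import path follows file 101 in the list above, while the selection loop itself is purely illustrative and not part of this release.

import random

from deepeval.optimizer.algorithms.simba.types import SIMBAStrategy

rng = random.Random(0)
# Hypothetical: pick one edit strategy for the current minibatch.
strategy = rng.choice(list(SIMBAStrategy))
if strategy is SIMBAStrategy.APPEND_DEMO:
    print("append a distilled input/output demo to the prompt")
else:
    print("append a distilled natural-language rule to the prompt")
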
deepeval/optimizer/configs.py (new file)
@@ -0,0 +1,31 @@
+ from __future__ import annotations
+ from enum import Enum
+ from pydantic import BaseModel, Field, conint
+ from typing import Optional
+ from deepeval.evaluate.configs import AsyncConfig
+
+
+ class DisplayConfig(BaseModel):
+     show_indicator: bool = True
+     announce_ties: bool = Field(
+         False, description="Print a one-line note when a tie is detected"
+     )
+
+
+ class MutationTargetType(Enum):
+     RANDOM = "random"
+     FIXED_INDEX = "fixed_index"
+
+
+ # default all messages
+ class MutationConfig(BaseModel):
+     target_type: MutationTargetType = MutationTargetType.RANDOM
+     # should be list
+     target_role: Optional[str] = Field(
+         default=None,
+         description="If set, restricts candidates to messages with this role (case insensitive).",
+     )
+     target_index: conint(ge=0) = Field(
+         default=0,
+         description="0-based index used when target_type == FIXED_INDEX.",
+     )
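
Note: a hedged sketch of constructing these configs directly. The import path mirrors file 102 (deepeval/optimizer/configs.py); whether the package re-exports them elsewhere is not shown in this diff, and the commented usage is assumed from the field descriptions.

from deepeval.optimizer.configs import (
    DisplayConfig,
    MutationConfig,
    MutationTargetType,
)

display = DisplayConfig(show_indicator=True, announce_ties=True)
mutation = MutationConfig(
    target_type=MutationTargetType.FIXED_INDEX,
    target_role="system",  # only consider messages with this role (assumed usage)
    target_index=0,        # 0-based index consulted when target_type is FIXED_INDEX
)
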
deepeval/optimizer/policies.py (new file)
@@ -0,0 +1,227 @@
+ from __future__ import annotations
+ from enum import Enum
+ import random
+ from typing import Dict, List, Sequence, Optional, Tuple
+
+ from deepeval.errors import DeepEvalError
+ from deepeval.optimizer.types import PromptConfigurationId, ScoreTable
+
+
+ def _is_dominated(
+     candidate_scores: List[float], other_scores: List[float]
+ ) -> bool:
+     """
+     Return True if `candidate_scores` is dominated by `other_scores`:
+     (other >= candidate on all dimensions) AND (other > candidate on at least one).
+     """
+     other_ge_everywhere = all(
+         other_score >= candidate_score
+         for candidate_score, other_score in zip(candidate_scores, other_scores)
+     )
+     other_gt_somewhere = any(
+         other_score > candidate_score
+         for candidate_score, other_score in zip(candidate_scores, other_scores)
+     )
+     return other_ge_everywhere and other_gt_somewhere
+
+
+ def pareto_frontier(
+     prompt_configuration_ids: Sequence[PromptConfigurationId],
+     score_table: ScoreTable,
+ ) -> List[PromptConfigurationId]:
+     """
+     Compute the set of non-dominated candidates given their scores.
+     Returns PromptConfigurationIds on the Pareto frontier.
+     """
+     frontier: List[PromptConfigurationId] = []
+     for prompt_configuration_id in prompt_configuration_ids:
+         candidate_vector = score_table[prompt_configuration_id]
+         dominated = False
+
+         # If any existing frontier member dominates this candidate, skip it.
+         for frontier_id in frontier:
+             if _is_dominated(candidate_vector, score_table[frontier_id]):
+                 dominated = True
+                 break
+         if dominated:
+             continue
+
+         # Remove any frontier member that is dominated by this candidate.
+         frontier = [
+             f_id
+             for f_id in frontier
+             if not _is_dominated(score_table[f_id], candidate_vector)
+         ]
+         frontier.append(prompt_configuration_id)
+
+     return frontier
+
+
+ def frequency_weights(
+     score_table: ScoreTable,
+ ) -> Dict[PromptConfigurationId, int]:
+     """
+     Build best sets, remove dominated candidates, and count appearances.
+
+     Returns:
+         A map {prompt_configuration_id -> frequency} counting how often each
+         globally non-dominated prompt configuration appears among the instance
+         Pareto sets.
+     """
+     if not score_table:
+         return {}
+
+     # Assume all score vectors have the same length.
+     example_vector = next(iter(score_table.values()))
+     num_instances = len(example_vector)
+     all_candidates = list(score_table.keys())
+
+     per_instance_frontiers: List[List[PromptConfigurationId]] = []
+     for i in range(num_instances):
+         best_score_i = max(
+             score_table[prompt_configuration_id][i]
+             for prompt_configuration_id in all_candidates
+         )
+         winners_i = [
+             prompt_configuration_id
+             for prompt_configuration_id in all_candidates
+             if score_table[prompt_configuration_id][i] == best_score_i
+         ]
+
+         # Instance frontier among winners. We pass 1-D score vectors
+         # so this reduces to "all candidates with the max score at instance i".
+         instance_frontier = pareto_frontier(
+             winners_i,
+             {
+                 prompt_configuration_id: [
+                     score_table[prompt_configuration_id][i]
+                 ]
+                 for prompt_configuration_id in winners_i
+             },
+         )
+         per_instance_frontiers.append(instance_frontier)
+
+     # Global candidate set appearing in any winners
+     candidate_union = sorted(
+         {
+             prompt_configuration_id
+             for winners in per_instance_frontiers
+             for prompt_configuration_id in winners
+         }
+     )
+     global_frontier = pareto_frontier(candidate_union, score_table)
+
+     # Count frequency only for candidates on the global frontier
+     frequency_by_prompt_config: Dict[PromptConfigurationId, int] = {
+         prompt_configuration_id: 0
+         for prompt_configuration_id in global_frontier
+     }
+     for winners in per_instance_frontiers:
+         for prompt_configuration_id in winners:
+             if prompt_configuration_id in frequency_by_prompt_config:
+                 frequency_by_prompt_config[prompt_configuration_id] += 1
+
+     return frequency_by_prompt_config
+
+
+ def sample_by_frequency(
+     frequency_by_prompt_config: Dict[PromptConfigurationId, int],
+     *,
+     random_state: random.Random,
+ ) -> PromptConfigurationId:
+     """
+     Sample a prompt configuration id with probability proportional to its frequency.
+     Falls back to uniform if the total weight is zero.
+     """
+     if not frequency_by_prompt_config:
+         raise DeepEvalError("No prompt configurations to sample.")
+
+     items = list(frequency_by_prompt_config.items())
+     total_weight = sum(weight for _, weight in items)
+
+     if total_weight == 0:
+         # Uniform fallback
+         return random_state.choice(
+             [prompt_configuration_id for prompt_configuration_id, _ in items]
+         )
+
+     r = random_state.uniform(0, total_weight)
+     cumulative = 0.0
+     for prompt_configuration_id, weight in items:
+         cumulative += weight
+         if r <= cumulative:
+             return prompt_configuration_id
+     return items[-1][0]
+
+
+ def select_prompt_configuration_pareto(
+     score_table: ScoreTable, *, random_state: random.Random
+ ) -> PromptConfigurationId:
+     """
+     Frequency weighted sampling over the Pareto winners,
+     restricted to globally non-dominated prompt configurations. A configuration
+     is globally non-dominated if no other configuration dominates it using
+     the full vector.
+     """
+     freq = frequency_weights(score_table)
+     return sample_by_frequency(freq, random_state=random_state)
+
+
+ class TieBreaker(str, Enum):
+     PREFER_ROOT = "prefer_root"
+     PREFER_CHILD = "prefer_child"
+     RANDOM = "random"
+
+
+ def pick_best_with_ties(
+     totals: Dict[PromptConfigurationId, float],
+     parents_by_id: Dict[PromptConfigurationId, Optional[PromptConfigurationId]],
+     *,
+     random_state: random.Random,
+     tie_tolerance: float = 1e-9,
+     policy: TieBreaker = TieBreaker.PREFER_ROOT,
+ ) -> Tuple[PromptConfigurationId, List[PromptConfigurationId], float]:
+     """
+     Choose the best candidate by aggregate score with deterministic tie handling.
+
+     Returns: (chosen_id, tied_ids, max_score)
+     - tied_ids includes everyone within tie_tolerance of max_score
+     """
+     if not totals:
+         raise DeepEvalError("No candidate prompt configuration to choose from.")
+
+     max_score = max(totals.values())
+     tied = [
+         prompt_configuration_id
+         for prompt_configuration_id, score in totals.items()
+         if abs(score - max_score) <= tie_tolerance
+     ]
+
+     if len(tied) == 1:
+         return tied[0], tied, max_score
+
+     # Resolve tie by policy
+     if policy == TieBreaker.PREFER_CHILD:
+         # Prefer any non root. When multiple children exist, use the most recent
+         child_ids = [
+             prompt_configuration_id
+             for prompt_configuration_id in tied
+             if parents_by_id.get(prompt_configuration_id) is not None
+         ]
+         if child_ids:
+             # choose the newest child deterministically by order
+             for prompt_configuration_id in reversed(list(totals.keys())):
+                 if prompt_configuration_id in child_ids:
+                     return prompt_configuration_id, tied, max_score
+
+     if policy == TieBreaker.RANDOM:
+         return random_state.choice(tied), tied, max_score
+
+     # by default prefer a root if present, otherwise the first tied
+     root_ids = [
+         prompt_configuration_id
+         for prompt_configuration_id in tied
+         if parents_by_id.get(prompt_configuration_id) is None
+     ]
+     chosen = root_ids[0] if root_ids else tied[0]
+     return chosen, tied, max_score
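
Note: a small worked example of the two Pareto helpers above, with made-up scores and plain strings standing in for PromptConfigurationId values. Each score vector holds one score per evaluation instance (golden).

from deepeval.optimizer.policies import frequency_weights, pareto_frontier

score_table = {
    "root":    [0.70, 0.55, 0.90],
    "child_a": [0.80, 0.60, 0.85],
    "child_b": [0.75, 0.50, 0.80],  # dominated by "child_a" on every instance
}

print(pareto_frontier(list(score_table), score_table))
# -> ['root', 'child_a']
print(frequency_weights(score_table))
# -> {'child_a': 2, 'root': 1}  (per-instance winners, counted only on the global frontier)
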
deepeval/optimizer/prompt_optimizer.py (new file)
@@ -0,0 +1,263 @@
+ from contextlib import contextmanager
+ from typing import (
+     Callable,
+     Dict,
+     List,
+     Optional,
+     Tuple,
+     Union,
+ )
+
+ from rich.progress import (
+     Progress,
+     SpinnerColumn,
+     BarColumn,
+     TextColumn,
+     TimeElapsedColumn,
+ )
+
+ from deepeval.dataset.golden import Golden, ConversationalGolden
+ from deepeval.errors import DeepEvalError
+ from deepeval.metrics import BaseConversationalMetric, BaseMetric
+ from deepeval.metrics.utils import initialize_model
+ from deepeval.models.base_model import DeepEvalBaseLLM
+ from deepeval.optimizer.scorer import Scorer
+ from deepeval.optimizer.rewriter import Rewriter
+ from deepeval.optimizer.types import (
+     ModelCallback,
+     RunnerStatusType,
+ )
+ from deepeval.optimizer.utils import (
+     validate_callback,
+     validate_metrics,
+ )
+ from deepeval.optimizer.configs import (
+     DisplayConfig,
+     MutationConfig,
+     AsyncConfig,
+ )
+ from deepeval.prompt.prompt import Prompt
+ from deepeval.utils import get_or_create_event_loop
+ from deepeval.optimizer.algorithms import (
+     GEPA,
+     MIPROV2,
+     COPRO,
+     SIMBA,
+ )
+ from deepeval.optimizer.algorithms.configs import (
+     GEPA_REWRITE_INSTRUCTION_MAX_CHARS,
+     MIPROV2_REWRITE_INSTRUCTION_MAX_CHARS,
+ )
+
+
+ class PromptOptimizer:
+     def __init__(
+         self,
+         model_callback: ModelCallback,
+         metrics: Union[List[BaseMetric], List[BaseConversationalMetric]],
+         optimizer_model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+         algorithm: Union[GEPA, MIPROV2, COPRO, SIMBA] = GEPA(),
+         async_config: Optional[AsyncConfig] = AsyncConfig(),
+         display_config: Optional[DisplayConfig] = DisplayConfig(),
+         mutation_config: Optional[MutationConfig] = MutationConfig(),
+     ):
+         self.optimizer_model, self.using_native_model = initialize_model(
+             optimizer_model
+         )
+         self.model_callback = validate_callback(
+             component="PromptOptimizer",
+             model_callback=model_callback,
+         )
+         self.metrics = validate_metrics(
+             component="PromptOptimizer", metrics=metrics
+         )
+
+         self.async_config = async_config
+         self.display_config = display_config
+         self.mutation_config = mutation_config
+         self.algorithm = algorithm
+         self.optimization_report = None
+         self._configure_algorithm()
+
+         # Internal state used only when a progress indicator is active.
+         # Tuple is (Progress instance, task_id).
+         self._progress_state: Optional[Tuple[Progress, int]] = None
+
+     ##############
+     # Public API #
+     ##############
+
+     def optimize(
+         self,
+         prompt: Prompt,
+         goldens: Union[List[Golden], List[ConversationalGolden]],
+     ) -> Prompt:
+         if self.async_config.run_async:
+             loop = get_or_create_event_loop()
+             return loop.run_until_complete(
+                 self.a_optimize(prompt=prompt, goldens=goldens)
+             )
+
+         try:
+             with self._progress_context():
+                 best_prompt, self.optimization_report = self.algorithm.execute(
+                     prompt=prompt, goldens=goldens
+                 )
+         except Exception as exc:
+             self._handle_optimization_error(exc)
+
+         return best_prompt
+
+     async def a_optimize(
+         self,
+         prompt: Prompt,
+         goldens: Union[List[Golden], List[ConversationalGolden]],
+     ) -> Prompt:
+         try:
+             with self._progress_context():
+                 best_prompt, self.optimization_report = (
+                     await self.algorithm.a_execute(
+                         prompt=prompt, goldens=goldens
+                     )
+                 )
+         except Exception as exc:
+             self._handle_optimization_error(exc)
+
+         return best_prompt
+
+     ####################
+     # Internal helpers #
+     ####################
+
+     def _configure_algorithm(self) -> None:
+         """Configure the algorithm with scorer, rewriter, and callbacks."""
+         self.algorithm.scorer = Scorer(
+             model_callback=self.model_callback,
+             metrics=self.metrics,
+             max_concurrent=self.async_config.max_concurrent,
+             throttle_seconds=float(self.async_config.throttle_value),
+         )
+
+         # Attach rewriter for mutation behavior
+         # GEPA uses internal constant; other algorithms use MIPROV2 constant
+         if isinstance(self.algorithm, GEPA):
+             max_chars = GEPA_REWRITE_INSTRUCTION_MAX_CHARS
+         else:
+             max_chars = MIPROV2_REWRITE_INSTRUCTION_MAX_CHARS
+         self.algorithm._rewriter = Rewriter(
+             optimizer_model=self.optimizer_model,
+             max_chars=max_chars,
+             list_mutation_config=self.mutation_config,
+             random_state=self.algorithm.random_state,
+         )
+
+         # Set status callback
+         self.algorithm.status_callback = self._on_status
+
+     @contextmanager
+     def _progress_context(self):
+         """Context manager that sets up progress indicator if enabled."""
+         if not self.display_config.show_indicator:
+             yield
+             return
+
+         with Progress(
+             SpinnerColumn(style="rgb(106,0,255)"),
+             TextColumn("[progress.description]{task.description}"),
+             BarColumn(bar_width=40),
+             TimeElapsedColumn(),
+             transient=True,
+         ) as progress:
+             task = progress.add_task(
+                 f"Optimizing prompt with {self.algorithm.name}..."
+             )
+             self._progress_state = (progress, task)
+             try:
+                 yield
+             finally:
+                 self._progress_state = None
+
+     def _handle_optimization_error(self, exc: Exception) -> None:
+         """
+         Handle optimization errors by formatting and raising a user-friendly message.
+         """
+         total_steps: Optional[int] = None
+         iterations: Optional[int] = getattr(self.algorithm, "iterations", None)
+         if iterations is not None:
+             total_steps = int(iterations)
+
+         prefix = f"(iterations={iterations}) " if iterations is not None else ""
+         detail = (
+             f"{prefix}• error {exc.__class__.__name__}: {exc} "
+             "• halted before first iteration"
+         )
+
+         self._on_status(
+             RunnerStatusType.ERROR,
+             detail=detail,
+             step_index=None,
+             total_steps=total_steps,
+         )
+
+         algo = self.algorithm.name
+         raise DeepEvalError(f"[{algo}] {detail}") from None
+
+     def _on_status(
+         self,
+         kind: RunnerStatusType,
+         detail: str,
+         step_index: Optional[int] = None,
+         total_steps: Optional[int] = None,
+     ) -> None:
+         """
+         Unified status callback used by the algorithm.
+
+         - PROGRESS: update the progress bar description and position
+         - TIE: optionally print a tie message
+         - ERROR: print a concise error message and allow the run to halt
+         """
+         algo = self.algorithm.name
+
+         if kind is RunnerStatusType.ERROR:
+             if self._progress_state is not None:
+                 progress, task = self._progress_state
+                 if total_steps is not None:
+                     progress.update(task, total=total_steps)
+                 description = self._format_progress_description(detail)
+                 progress.update(task, description=description)
+             print(f"[{algo}] {detail}")
+             return
+
+         if kind is RunnerStatusType.TIE:
+             if not self.display_config.announce_ties:
+                 return
+             print(f"[{algo}] {detail}")
+             return
+
+         if kind is not RunnerStatusType.PROGRESS:
+             return
+
+         if self._progress_state is None:
+             return
+
+         progress, task = self._progress_state
+
+         if total_steps is not None:
+             progress.update(task, total=total_steps)
+
+         if step_index is not None and step_index > 0:
+             progress.advance(task, 1)
+
+         description = self._format_progress_description(detail)
+         progress.update(task, description=description)
+
+     def _format_progress_description(self, detail: str) -> str:
+         """
+         Compose a human readable progress line using an algorithm agnostic
+         prefix and an algorithm specific detail string provided by the algorithm.
+         """
+         algo = self.algorithm.name
+         base = f"Optimizing prompt with {algo}"
+         if detail:
+             return f"{base} [rgb(25,227,160)]{detail}[/]"
+         return base
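
Note: a hedged end-to-end sketch of the new optimizer entry point. The callback body, metric choice, and golden are illustrative; the exact ModelCallback signature is defined in deepeval/optimizer/types.py, which this diff does not show in full, and the prompt placeholder stands in for an existing deepeval Prompt.

from deepeval.dataset.golden import Golden
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.optimizer.algorithms import GEPA
from deepeval.optimizer.prompt_optimizer import PromptOptimizer
from deepeval.prompt.prompt import Prompt


def model_callback(prompt: Prompt, golden: Golden) -> str:
    # Run your own LLM application with the candidate prompt and return its output.
    # (Signature assumed for illustration; see ModelCallback in optimizer/types.py.)
    return "..."


optimizer = PromptOptimizer(
    model_callback=model_callback,
    metrics=[AnswerRelevancyMetric()],
    algorithm=GEPA(),          # or MIPROV2(), COPRO(), SIMBA()
)
prompt: Prompt = ...           # an existing deepeval Prompt to optimize
goldens = [Golden(input="What does the contextual recall metric measure?")]
best_prompt = optimizer.optimize(prompt=prompt, goldens=goldens)
print(optimizer.optimization_report)
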
deepeval/optimizer/rewriter/__init__.py (new file)
@@ -0,0 +1,5 @@
+ from .rewriter import Rewriter
+
+ __all__ = [
+     "Rewriter",
+ ]
deepeval/optimizer/rewriter/rewriter.py (new file)
@@ -0,0 +1,124 @@
+ from __future__ import annotations
+ import random
+ from typing import Optional, Tuple, Union
+
+ from deepeval.models.base_model import DeepEvalBaseLLM
+ from deepeval.optimizer.types import (
+     ModuleId,
+ )
+ from deepeval.optimizer.configs import (
+     MutationConfig,
+ )
+ from deepeval.prompt.prompt import Prompt
+ from deepeval.optimizer.rewriter.utils import (
+     _summarize_prompt_for_rewrite,
+     _compose_prompt_messages,
+     _normalize_llm_output_to_text,
+     _apply_rewritten_prompt,
+ )
+
+
+ class Rewriter:
+     """
+     Uses a provided DeepEval model to rewrite the prompt for a module,
+     guided by feedback_text (μ_f).
+
+     For LIST prompts, the target message to rewrite is chosen according to
+     `list_mutation_config` and `random_state`.
+     """
+
+     def __init__(
+         self,
+         optimizer_model: DeepEvalBaseLLM,
+         max_chars: int = 4000,
+         list_mutation_config: Optional[MutationConfig] = None,
+         random_state: Optional[Union[int, random.Random]] = None,
+     ):
+         self.optimizer_model = optimizer_model
+         self.max_chars = max_chars
+         self.list_mutation_config = list_mutation_config or MutationConfig()
+
+         # Accept either an int seed or a Random instance.
+         if isinstance(random_state, int):
+             self.random_state: Optional[random.Random] = random.Random(
+                 random_state
+             )
+         else:
+             self.random_state = random_state or random.Random()
+
+     def _compose_messages(
+         self, *, module_id: ModuleId, old_prompt: Prompt, feedback_text: str
+     ) -> Tuple[str, str]:
+         current_prompt_block = _summarize_prompt_for_rewrite(
+             old_prompt, self.max_chars
+         )
+         system_message = (
+             "You are refining a prompt used in a multi-step LLM pipeline. "
+             "Given the current prompt and concise feedback, produce a revised prompt "
+             "that addresses the issues while preserving intent and style. "
+             "Return only the new prompt text, no explanations."
+         )
+         user_message = f"""[Current Prompt]
+ {current_prompt_block}
+
+ [Feedback]
+ {feedback_text[:self.max_chars]}
+
+ [Instruction]
+ Rewrite the prompt. Keep it concise and actionable. Do not include extraneous text.
+ """
+         return system_message, user_message
+
+     def rewrite(
+         self,
+         module_id: ModuleId,
+         old_prompt: Prompt,
+         feedback_text: str,
+     ) -> Prompt:
+         if not feedback_text.strip():
+             return old_prompt
+
+         system_message, user_message = self._compose_messages(
+             module_id=module_id,
+             old_prompt=old_prompt,
+             feedback_text=feedback_text,
+         )
+         merged_prompt_text = _compose_prompt_messages(
+             system_message, user_message
+         )
+
+         out = self.optimizer_model.generate(merged_prompt_text)
+         new_text = _normalize_llm_output_to_text(out)
+         return _apply_rewritten_prompt(
+             old_prompt,
+             new_text,
+             self.random_state,
+             self.list_mutation_config,
+         )
+
+     async def a_rewrite(
+         self,
+         module_id: ModuleId,
+         old_prompt: Prompt,
+         feedback_text: str,
+     ) -> Prompt:
+         if not feedback_text.strip():
+             return old_prompt
+
+         system_message, user_message = self._compose_messages(
+             module_id=module_id,
+             old_prompt=old_prompt,
+             feedback_text=feedback_text,
+         )
+         merged_prompt_text = _compose_prompt_messages(
+             system_message, user_message
+         )
+
+         out = await self.optimizer_model.a_generate(merged_prompt_text)
+         new_text = _normalize_llm_output_to_text(out)
+         return _apply_rewritten_prompt(
+             old_prompt,
+             new_text,
+             self.random_state,
+             self.list_mutation_config,
+         )
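
Note: PromptOptimizer._configure_algorithm normally wires the Rewriter up for the chosen algorithm. A hedged sketch of driving it directly follows; the module id, feedback text, and model name are illustrative, and the Prompt placeholder stands in for an existing prompt object.

from deepeval.metrics.utils import initialize_model
from deepeval.optimizer.rewriter import Rewriter
from deepeval.prompt.prompt import Prompt

optimizer_model, _ = initialize_model("gpt-4o")  # any DeepEvalBaseLLM should work here
rewriter = Rewriter(optimizer_model=optimizer_model, max_chars=4000, random_state=42)

old_prompt: Prompt = ...  # the prompt for the module being mutated
new_prompt = rewriter.rewrite(
    module_id="module-0",  # ModuleId value chosen for illustration
    old_prompt=old_prompt,
    feedback_text="Responses ignore the retrieved context; instruct the model to ground every claim in it.",
)
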