deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224) hide show
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,263 @@
1
+ from contextlib import contextmanager
2
+ from typing import (
3
+ Callable,
4
+ Dict,
5
+ List,
6
+ Optional,
7
+ Tuple,
8
+ Union,
9
+ )
10
+
11
+ from rich.progress import (
12
+ Progress,
13
+ SpinnerColumn,
14
+ BarColumn,
15
+ TextColumn,
16
+ TimeElapsedColumn,
17
+ )
18
+
19
+ from deepeval.dataset.golden import Golden, ConversationalGolden
20
+ from deepeval.errors import DeepEvalError
21
+ from deepeval.metrics import BaseConversationalMetric, BaseMetric
22
+ from deepeval.metrics.utils import initialize_model
23
+ from deepeval.models.base_model import DeepEvalBaseLLM
24
+ from deepeval.optimizer.scorer import Scorer
25
+ from deepeval.optimizer.rewriter import Rewriter
26
+ from deepeval.optimizer.types import (
27
+ ModelCallback,
28
+ RunnerStatusType,
29
+ )
30
+ from deepeval.optimizer.utils import (
31
+ validate_callback,
32
+ validate_metrics,
33
+ )
34
+ from deepeval.optimizer.configs import (
35
+ DisplayConfig,
36
+ MutationConfig,
37
+ AsyncConfig,
38
+ )
39
+ from deepeval.prompt.prompt import Prompt
40
+ from deepeval.utils import get_or_create_event_loop
41
+ from deepeval.optimizer.algorithms import (
42
+ GEPA,
43
+ MIPROV2,
44
+ COPRO,
45
+ SIMBA,
46
+ )
47
+ from deepeval.optimizer.algorithms.configs import (
48
+ GEPA_REWRITE_INSTRUCTION_MAX_CHARS,
49
+ MIPROV2_REWRITE_INSTRUCTION_MAX_CHARS,
50
+ )
51
+
52
+
53
class PromptOptimizer:
    """
    Optimizes a Prompt against a set of goldens using a pluggable
    optimization algorithm (GEPA, MIPROV2, COPRO, or SIMBA).

    The optimizer wires a Scorer (metric evaluation) and a Rewriter
    (prompt mutation) into the chosen algorithm, drives the run either
    synchronously or on an asyncio event loop, and surfaces progress and
    errors through a rich progress indicator.
    """

    def __init__(
        self,
        model_callback: ModelCallback,
        metrics: Union[List[BaseMetric], List[BaseConversationalMetric]],
        optimizer_model: Optional[Union[str, DeepEvalBaseLLM]] = None,
        algorithm: Optional[Union[GEPA, MIPROV2, COPRO, SIMBA]] = None,
        async_config: Optional[AsyncConfig] = None,
        display_config: Optional[DisplayConfig] = None,
        mutation_config: Optional[MutationConfig] = None,
    ):
        # Defaults are instantiated per-call rather than in the signature.
        # Signature-level defaults such as `algorithm=GEPA()` are evaluated
        # once at class-definition time and shared by every PromptOptimizer
        # instance; _configure_algorithm() mutates the algorithm (scorer,
        # rewriter, status_callback), so a shared default would leak state
        # between optimizers. Accepting an explicit None also no longer
        # crashes later attribute access.
        self.optimizer_model, self.using_native_model = initialize_model(
            optimizer_model
        )
        self.model_callback = validate_callback(
            component="PromptOptimizer",
            model_callback=model_callback,
        )
        self.metrics = validate_metrics(
            component="PromptOptimizer", metrics=metrics
        )

        self.async_config = (
            async_config if async_config is not None else AsyncConfig()
        )
        self.display_config = (
            display_config if display_config is not None else DisplayConfig()
        )
        self.mutation_config = (
            mutation_config if mutation_config is not None else MutationConfig()
        )
        self.algorithm = algorithm if algorithm is not None else GEPA()
        self.optimization_report = None
        self._configure_algorithm()

        # Internal state used only when a progress indicator is active.
        # Tuple is (Progress instance, task_id).
        self._progress_state: Optional[Tuple[Progress, int]] = None

    ##############
    # Public API #
    ##############

    def optimize(
        self,
        prompt: Prompt,
        goldens: Union[List[Golden], List[ConversationalGolden]],
    ) -> Prompt:
        """
        Run the configured algorithm synchronously and return the best prompt.

        When `async_config.run_async` is set, the work is delegated to
        `a_optimize` on the shared event loop. Raises DeepEvalError when the
        algorithm fails.
        """
        if self.async_config.run_async:
            loop = get_or_create_event_loop()
            return loop.run_until_complete(
                self.a_optimize(prompt=prompt, goldens=goldens)
            )

        try:
            with self._progress_context():
                best_prompt, self.optimization_report = self.algorithm.execute(
                    prompt=prompt, goldens=goldens
                )
            # Return inside the try: the except branch always raises, and a
            # `return best_prompt` placed after the try would reference a
            # possibly-unbound name (and break silently if the error handler
            # ever stopped raising).
            return best_prompt
        except Exception as exc:
            self._handle_optimization_error(exc)
            raise  # unreachable: _handle_optimization_error always raises

    async def a_optimize(
        self,
        prompt: Prompt,
        goldens: Union[List[Golden], List[ConversationalGolden]],
    ) -> Prompt:
        """
        Async variant of `optimize`; awaits the algorithm's `a_execute`.

        Raises DeepEvalError when the algorithm fails.
        """
        try:
            with self._progress_context():
                best_prompt, self.optimization_report = (
                    await self.algorithm.a_execute(
                        prompt=prompt, goldens=goldens
                    )
                )
            # See optimize(): return inside the try so `best_prompt` is
            # never referenced on the error path.
            return best_prompt
        except Exception as exc:
            self._handle_optimization_error(exc)
            raise  # unreachable: _handle_optimization_error always raises

    ####################
    # Internal helpers #
    ####################

    def _configure_algorithm(self) -> None:
        """Configure the algorithm with scorer, rewriter, and callbacks."""
        self.algorithm.scorer = Scorer(
            model_callback=self.model_callback,
            metrics=self.metrics,
            max_concurrent=self.async_config.max_concurrent,
            throttle_seconds=float(self.async_config.throttle_value),
        )

        # Attach rewriter for mutation behavior.
        # GEPA uses its own rewrite-length constant; every other algorithm
        # shares the MIPROV2 constant.
        if isinstance(self.algorithm, GEPA):
            max_chars = GEPA_REWRITE_INSTRUCTION_MAX_CHARS
        else:
            max_chars = MIPROV2_REWRITE_INSTRUCTION_MAX_CHARS
        self.algorithm._rewriter = Rewriter(
            optimizer_model=self.optimizer_model,
            max_chars=max_chars,
            list_mutation_config=self.mutation_config,
            random_state=self.algorithm.random_state,
        )

        # Route algorithm status events back into this optimizer.
        self.algorithm.status_callback = self._on_status

    @contextmanager
    def _progress_context(self):
        """
        Context manager that sets up the progress indicator if enabled.

        While active, `self._progress_state` holds (Progress, task_id) so
        `_on_status` can update the bar; it is always cleared on exit.
        """
        if not self.display_config.show_indicator:
            yield
            return

        with Progress(
            SpinnerColumn(style="rgb(106,0,255)"),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(bar_width=40),
            TimeElapsedColumn(),
            transient=True,
        ) as progress:
            task = progress.add_task(
                f"Optimizing prompt with {self.algorithm.name}..."
            )
            self._progress_state = (progress, task)
            try:
                yield
            finally:
                self._progress_state = None

    def _handle_optimization_error(self, exc: Exception) -> None:
        """
        Handle optimization errors by formatting and raising a user-friendly
        message. This method always raises DeepEvalError.
        """
        total_steps: Optional[int] = None
        iterations: Optional[int] = getattr(self.algorithm, "iterations", None)
        if iterations is not None:
            total_steps = int(iterations)

        # NOTE(review): the message always claims the run halted before the
        # first iteration, even when the failure happened mid-run — confirm
        # whether step information should be threaded through here.
        prefix = f"(iterations={iterations}) " if iterations is not None else ""
        detail = (
            f"{prefix}• error {exc.__class__.__name__}: {exc} "
            "• halted before first iteration"
        )

        self._on_status(
            RunnerStatusType.ERROR,
            detail=detail,
            step_index=None,
            total_steps=total_steps,
        )

        algo = self.algorithm.name
        # `from None` intentionally suppresses exception chaining: the
        # original error is already embedded in `detail`.
        raise DeepEvalError(f"[{algo}] {detail}") from None

    def _on_status(
        self,
        kind: RunnerStatusType,
        detail: str,
        step_index: Optional[int] = None,
        total_steps: Optional[int] = None,
    ) -> None:
        """
        Unified status callback used by the algorithm.

        - PROGRESS: update the progress bar description and position
        - TIE: optionally print a tie message
        - ERROR: print a concise error message and allow the run to halt
        """
        algo = self.algorithm.name

        if kind is RunnerStatusType.ERROR:
            if self._progress_state is not None:
                progress, task = self._progress_state
                if total_steps is not None:
                    progress.update(task, total=total_steps)
                description = self._format_progress_description(detail)
                progress.update(task, description=description)
            print(f"[{algo}] {detail}")
            return

        if kind is RunnerStatusType.TIE:
            if not self.display_config.announce_ties:
                return
            print(f"[{algo}] {detail}")
            return

        if kind is not RunnerStatusType.PROGRESS:
            return

        if self._progress_state is None:
            return

        progress, task = self._progress_state

        if total_steps is not None:
            progress.update(task, total=total_steps)

        # step_index 0 is the initial state; only advance on real steps.
        if step_index is not None and step_index > 0:
            progress.advance(task, 1)

        description = self._format_progress_description(detail)
        progress.update(task, description=description)

    def _format_progress_description(self, detail: str) -> str:
        """
        Compose a human readable progress line using an algorithm agnostic
        prefix and an algorithm specific detail string provided by the
        algorithm.
        """
        algo = self.algorithm.name
        base = f"Optimizing prompt with {algo}"
        if detail:
            return f"{base} [rgb(25,227,160)]{detail}[/]"
        return base
@@ -0,0 +1,5 @@
1
"""Public interface of the rewriter subpackage."""

from .rewriter import Rewriter

# Names re-exported as this subpackage's public API.
__all__ = ["Rewriter"]
@@ -0,0 +1,124 @@
1
+ from __future__ import annotations
2
+ import random
3
+ from typing import Optional, Tuple, Union
4
+
5
+ from deepeval.models.base_model import DeepEvalBaseLLM
6
+ from deepeval.optimizer.types import (
7
+ ModuleId,
8
+ )
9
+ from deepeval.optimizer.configs import (
10
+ MutationConfig,
11
+ )
12
+ from deepeval.prompt.prompt import Prompt
13
+ from deepeval.optimizer.rewriter.utils import (
14
+ _summarize_prompt_for_rewrite,
15
+ _compose_prompt_messages,
16
+ _normalize_llm_output_to_text,
17
+ _apply_rewritten_prompt,
18
+ )
19
+
20
+
21
class Rewriter:
    """
    Uses a provided DeepEval model to rewrite the prompt for a module,
    guided by feedback_text (μ_f).

    For LIST prompts, the target message to rewrite is chosen according to
    `list_mutation_config` and `random_state`.
    """

    def __init__(
        self,
        optimizer_model: DeepEvalBaseLLM,
        max_chars: int = 4000,
        list_mutation_config: Optional[MutationConfig] = None,
        random_state: Optional[Union[int, random.Random]] = None,
    ):
        self.optimizer_model = optimizer_model
        self.max_chars = max_chars
        self.list_mutation_config = list_mutation_config or MutationConfig()

        # An int seed yields a seeded Random; an existing Random instance is
        # used as-is; None falls back to a fresh unseeded Random.
        if isinstance(random_state, int):
            self.random_state: Optional[random.Random] = random.Random(
                random_state
            )
        else:
            self.random_state = random_state or random.Random()

    def _compose_messages(
        self, *, module_id: ModuleId, old_prompt: Prompt, feedback_text: str
    ) -> Tuple[str, str]:
        """Build the (system, user) message pair for the rewrite request."""
        prompt_block = _summarize_prompt_for_rewrite(
            old_prompt, self.max_chars
        )
        system_message = (
            "You are refining a prompt used in a multi-step LLM pipeline. "
            "Given the current prompt and concise feedback, produce a revised prompt "
            "that addresses the issues while preserving intent and style. "
            "Return only the new prompt text, no explanations."
        )
        user_message = f"""[Current Prompt]
{prompt_block}

[Feedback]
{feedback_text[:self.max_chars]}

[Instruction]
Rewrite the prompt. Keep it concise and actionable. Do not include extraneous text.
"""
        return system_message, user_message

    def _merged_request(
        self, module_id: ModuleId, old_prompt: Prompt, feedback_text: str
    ) -> str:
        """Compose the system/user messages and merge them into one string."""
        system_message, user_message = self._compose_messages(
            module_id=module_id,
            old_prompt=old_prompt,
            feedback_text=feedback_text,
        )
        return _compose_prompt_messages(system_message, user_message)

    def _finalize(self, old_prompt: Prompt, raw_output) -> Prompt:
        """Normalize model output and splice it back into the prompt."""
        new_text = _normalize_llm_output_to_text(raw_output)
        return _apply_rewritten_prompt(
            old_prompt,
            new_text,
            self.random_state,
            self.list_mutation_config,
        )

    def rewrite(
        self,
        module_id: ModuleId,
        old_prompt: Prompt,
        feedback_text: str,
    ) -> Prompt:
        """
        Synchronously rewrite `old_prompt` guided by `feedback_text`.

        Empty or whitespace-only feedback is a no-op: the original prompt
        is returned unchanged.
        """
        if not feedback_text.strip():
            return old_prompt

        merged_prompt_text = self._merged_request(
            module_id, old_prompt, feedback_text
        )
        raw_output = self.optimizer_model.generate(merged_prompt_text)
        return self._finalize(old_prompt, raw_output)

    async def a_rewrite(
        self,
        module_id: ModuleId,
        old_prompt: Prompt,
        feedback_text: str,
    ) -> Prompt:
        """Async variant of `rewrite`; awaits the model's `a_generate`."""
        if not feedback_text.strip():
            return old_prompt

        merged_prompt_text = self._merged_request(
            module_id, old_prompt, feedback_text
        )
        raw_output = await self.optimizer_model.a_generate(merged_prompt_text)
        return self._finalize(old_prompt, raw_output)
@@ -0,0 +1,214 @@
1
+ from __future__ import annotations
2
+ import json
3
+ import random
4
+ from typing import List, Optional, Tuple, Union
5
+
6
+ from deepeval.errors import DeepEvalError
7
+ from deepeval.optimizer.utils import (
8
+ validate_int_in_range,
9
+ validate_instance,
10
+ )
11
+ from deepeval.optimizer.configs import (
12
+ MutationConfig,
13
+ MutationTargetType,
14
+ )
15
+ from deepeval.prompt.api import PromptType, PromptMessage
16
+ from deepeval.prompt.prompt import Prompt
17
+
18
+
19
+ ##################
20
+ # Common Helpers #
21
+ ##################
22
def _summarize_prompt_for_rewrite(old_prompt: Prompt, max_chars: int) -> str:
    """
    Produce a human-readable summary of the current prompt for the
    rewriter instruction block, truncated to `max_chars`.

    - For TEXT prompts, this is just `text_template`.
    - For LIST prompts, this is a numbered list of (role, content) lines.
    """

    # LIST prompts: render each message as "[n] (role) content".
    if old_prompt.type is PromptType.LIST and old_prompt.messages_template:
        rendered = "\n".join(
            f"[{position}] ({message.role or ''}) {message.content or ''}"
            for position, message in enumerate(
                old_prompt.messages_template, start=1
            )
        )
        return rendered[:max_chars]

    # Not a LIST prompt, so fall back to the plain text template.
    return (old_prompt.text_template or "")[:max_chars]
44
+
45
+
46
def _select_list_target_index(
    messages: List[PromptMessage],
    config: MutationConfig,
    random_state: random.Random,
) -> int:
    """
    Select which list message index to rewrite, based on MutationConfig.

    Rules:
    - Start with all indices in scope.
    - If target_role is set, restrict candidates to messages with that role
      (case insensitive). If no messages match, fall back to all indices.
    - target_type:
        * RANDOM: pick a candidate via random_state.choice(candidates).
        * FIXED_INDEX: use target_index as a position within the (possibly
          role-filtered) candidate list; an out-of-range index raises via
          validate_int_in_range.

    Raises:
        DeepEvalError: if `messages` is empty, `target_type` is not a
            MutationTargetType, `target_index` is out of range, or the
            target_type is not one of RANDOM / FIXED_INDEX.
    """
    if not messages:
        raise DeepEvalError(
            "Rewriter._select_list_target_index expected at least one "
            "message, but received an empty message list."
        )

    validate_instance(
        component="Rewriter._select_list_target_index",
        param_name="target_type",
        value=config.target_type,
        expected_types=MutationTargetType,
    )

    candidate_indices = list(range(len(messages)))

    # Optional case insensitive role restriction; keep all indices when no
    # message carries the requested role.
    if config.target_role:
        target_role_lower = config.target_role.lower()
        filtered = [
            index
            for index, message in enumerate(messages)
            if (message.role or "").lower() == target_role_lower
        ]
        if filtered:
            candidate_indices = filtered

    target_type = config.target_type

    if target_type is MutationTargetType.RANDOM:
        return random_state.choice(candidate_indices)

    if target_type is MutationTargetType.FIXED_INDEX:
        # target_index addresses a position within the candidate list, not
        # the raw message list, so role filtering stays consistent.
        index = validate_int_in_range(
            component="Rewriter._select_list_target_index",
            param_name="target_index",
            value=int(config.target_index),
            min_inclusive=0,
            max_exclusive=len(candidate_indices),
        )
        return candidate_indices[index]

    # Reaching this point means a new MutationTargetType was added but not
    # handled above.
    raise DeepEvalError(
        "Rewriter._select_list_target_index received unsupported "
        f"target_type={target_type!r}. Expected RANDOM or FIXED_INDEX."
    )
112
+
113
+
114
def _apply_rewritten_prompt(
    old_prompt: Prompt,
    new_text: str,
    random_state: random.Random,
    list_mutation_config: Optional[MutationConfig] = None,
) -> Prompt:
    """
    Apply the rewritten text to a Prompt, preserving representation:

    - For TEXT prompts, update `text_template`.
    - For LIST prompts, rewrite the content of a single message (chosen via
      `list_mutation_config` and `random_state`) while keeping the number
      of messages the same.
    - Preserve additional Prompt meta such as `label` and
      `interpolation_type`.

    Returns `old_prompt` unchanged when `new_text` is empty.
    """
    if not new_text:
        return old_prompt

    if old_prompt.type is PromptType.LIST and old_prompt.messages_template:
        messages = old_prompt.messages_template
        config = list_mutation_config or MutationConfig()

        target_index = _select_list_target_index(
            messages=messages,
            config=config,
            random_state=random_state,
        )

        # Replace only the targeted message's content; the original role is
        # preserved and no messages are added or removed.
        new_messages: List[PromptMessage] = [
            (
                PromptMessage(role=message.role, content=new_text)
                if message_index == target_index
                else message
            )
            for message_index, message in enumerate(messages)
        ]

        new_prompt = Prompt(
            alias=old_prompt.alias,
            text_template=None,
            messages_template=new_messages,
            model_settings=old_prompt.model_settings,
            output_type=old_prompt.output_type,
            output_schema=old_prompt.output_schema,
        )

    else:
        # Since it is not LIST, it must be TEXT type: swap in the new text.
        new_prompt = Prompt(
            alias=old_prompt.alias,
            text_template=new_text,
            model_settings=old_prompt.model_settings,
            output_type=old_prompt.output_type,
            output_schema=old_prompt.output_schema,
        )

    # Carry over metadata not accepted by the Prompt constructor.
    new_prompt.label = old_prompt.label
    new_prompt.interpolation_type = old_prompt.interpolation_type
    return new_prompt
176
+
177
+
178
+ def _compose_prompt_messages(system_message: str, user_message: str) -> str:
179
+ """
180
+ Join system and user messages into a single prompt string.
181
+ Strips surrounding whitespace from each part; if the system message is
182
+ empty or absent, returns just the user message.
183
+ """
184
+ system_text = (system_message or "").strip()
185
+ user_text = (user_message or "").strip()
186
+ return f"{system_text}\n\n{user_text}" if system_text else user_text
187
+
188
+
189
+ def _normalize_llm_output_to_text(
190
+ result: Union[str, Tuple[Union[str, dict], float], dict],
191
+ ) -> str:
192
+ """
193
+ Convert a DeepEval LLM generate() / a_generate() result to a clean string.
194
+
195
+ Accepted inputs:
196
+ - str -> returned as trimmed
197
+ - (str|dict, float_cost) -> first element extracted and normalized
198
+ - dict (e.g. JSON mode) -> JSON serialized with ensure_ascii=False
199
+
200
+ Fallback: if serialization fails, str(value).strip() is used.
201
+ """
202
+ output_value: Union[str, dict]
203
+ if isinstance(result, tuple):
204
+ output_value = result[0]
205
+ else:
206
+ output_value = result
207
+
208
+ if isinstance(output_value, str):
209
+ return output_value.strip()
210
+
211
+ try:
212
+ return json.dumps(output_value, ensure_ascii=False)
213
+ except Exception:
214
+ return str(output_value).strip()
@@ -0,0 +1,5 @@
1
from .scorer import Scorer

# Public API of this subpackage.
__all__ = ["Scorer"]
@@ -0,0 +1,86 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Union, List
3
+
4
+ from deepeval.optimizer.types import PromptConfiguration, ScoreVector
5
+ from deepeval.dataset.golden import Golden, ConversationalGolden
6
+
7
# Type alias: string identifier for a prompt "module" inside a
# PromptConfiguration (used by select_module / get_minibatch_feedback).
ModuleId = str
8
+
9
+
10
class BaseScorer(ABC):
    """Abstract scoring contract consumed by optimization runners.

    A runner delegates to a concrete scorer to:
        - produce per-instance scores over the Pareto subset,
        - compute minibatch mean scores used for candidate selection
          and acceptance,
        - produce the feedback text the Rewriter consumes,
        - pick which module of a prompt configuration to mutate.

    Every method has a sync and an async variant; subclasses must
    implement both.
    """

    # ------------------------------------------------------------------
    # Synchronous API
    # ------------------------------------------------------------------

    @abstractmethod
    def score_pareto(
        self,
        prompt_configuration: PromptConfiguration,
        d_pareto: Union[List[Golden], List[ConversationalGolden]],
    ) -> ScoreVector:
        """Score every instance in D_pareto; return one score per golden."""
        raise NotImplementedError

    @abstractmethod
    def score_minibatch(
        self,
        prompt_configuration: PromptConfiguration,
        minibatch: Union[List[Golden], List[ConversationalGolden]],
    ) -> float:
        """Return the mean score μ over a minibatch drawn from D_feedback."""
        raise NotImplementedError

    @abstractmethod
    def get_minibatch_feedback(
        self,
        prompt_configuration: PromptConfiguration,
        module: ModuleId,
        minibatch: Union[List[Golden], List[ConversationalGolden]],
    ) -> str:
        """Return μ_f feedback text for *module* (metric reasons, traces, ...)."""
        raise NotImplementedError

    @abstractmethod
    def select_module(
        self, prompt_configuration: PromptConfiguration
    ) -> ModuleId:
        """Choose which module of the configuration should be mutated next."""
        raise NotImplementedError

    # ------------------------------------------------------------------
    # Asynchronous API (mirrors the sync methods above)
    # ------------------------------------------------------------------

    @abstractmethod
    async def a_score_pareto(
        self,
        prompt_configuration: PromptConfiguration,
        d_pareto: Union[List[Golden], List[ConversationalGolden]],
    ) -> ScoreVector:
        """Async counterpart of :meth:`score_pareto`."""
        raise NotImplementedError

    @abstractmethod
    async def a_score_minibatch(
        self,
        prompt_configuration: PromptConfiguration,
        minibatch: Union[List[Golden], List[ConversationalGolden]],
    ) -> float:
        """Async counterpart of :meth:`score_minibatch`."""
        raise NotImplementedError

    @abstractmethod
    async def a_get_minibatch_feedback(
        self,
        prompt_configuration: PromptConfiguration,
        module: ModuleId,
        minibatch: Union[List[Golden], List[ConversationalGolden]],
    ) -> str:
        """Async counterpart of :meth:`get_minibatch_feedback`."""
        raise NotImplementedError

    @abstractmethod
    async def a_select_module(
        self, prompt_configuration: PromptConfiguration
    ) -> ModuleId:
        """Async counterpart of :meth:`select_module`."""
        raise NotImplementedError