deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/optimizer/scorer/scorer.py
@@ -0,0 +1,316 @@
+from __future__ import annotations
+import asyncio
+import copy
+from typing import (
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Union,
+)
+
+from deepeval.dataset.golden import Golden, ConversationalGolden
+from deepeval.dataset.utils import (
+    convert_goldens_to_test_cases,
+    convert_convo_goldens_to_convo_test_cases,
+)
+from deepeval.errors import DeepEvalError
+from deepeval.metrics import (
+    BaseMetric,
+    BaseConversationalMetric,
+)
+from deepeval.metrics.utils import copy_metrics
+from deepeval.test_case import (
+    LLMTestCase,
+    ConversationalTestCase,
+    Turn,
+)
+from deepeval.prompt.prompt import Prompt
+
+from deepeval.optimizer.types import (
+    ModelCallback,
+    PromptConfiguration,
+    Objective,
+    MeanObjective,
+    ModuleId,
+)
+from deepeval.optimizer.scorer.base import BaseScorer
+from deepeval.optimizer.utils import (
+    validate_callback,
+    validate_metrics,
+    invoke_model_callback,
+    a_invoke_model_callback,
+)
+from deepeval.optimizer.scorer.utils import (
+    _measure_no_indicator,
+    _a_measure_no_indicator,
+)
+
+
+class Scorer(BaseScorer):
+    """
+    Scores prompts by running model_callback, building test cases,
+    running metrics, and aggregating scores.
+    """
+
+    DEFAULT_MODULE_ID: ModuleId = "__module__"
+
+    def __init__(
+        self,
+        model_callback: ModelCallback,
+        metrics: Union[List[BaseMetric], List[BaseConversationalMetric]],
+        max_concurrent: int,
+        throttle_seconds: float,
+        objective_scalar: Objective = MeanObjective(),
+    ):
+        self.model_callback = validate_callback(
+            component="Scorer",
+            model_callback=model_callback,
+        )
+        self.metrics = validate_metrics(component="Scorer", metrics=metrics)
+        self.objective_scalar = objective_scalar
+        self._semaphore = asyncio.Semaphore(max_concurrent)
+        self._throttle = float(throttle_seconds)
+
+    ########################
+    # generation & scoring #
+    ########################
+
+    def generate(
+        self,
+        prompts_by_module: Dict[ModuleId, Prompt],
+        golden: Union[Golden, ConversationalGolden],
+    ) -> str:
+        module_id = self._select_module_id_from_prompts(prompts_by_module)
+        prompt = prompts_by_module.get(module_id) or next(
+            iter(prompts_by_module.values())
+        )
+
+        return invoke_model_callback(
+            model_callback=self.model_callback,
+            prompt=prompt,
+            golden=golden,
+        )
+
+    async def a_generate(
+        self,
+        prompts_by_module: Dict[ModuleId, Prompt],
+        golden: Union[Golden, ConversationalGolden],
+    ) -> str:
+        module_id = self._select_module_id_from_prompts(prompts_by_module)
+        prompt = prompts_by_module.get(module_id) or next(
+            iter(prompts_by_module.values())
+        )
+
+        return await a_invoke_model_callback(
+            model_callback=self.model_callback,
+            prompt=prompt,
+            golden=golden,
+        )
+
+    def score_pareto(
+        self,
+        prompt_configuration: PromptConfiguration,
+        d_pareto: Union[List[Golden], List[ConversationalGolden]],
+    ) -> List[float]:
+        return [
+            self._score_one(prompt_configuration, golden) for golden in d_pareto
+        ]
+
+    def score_minibatch(
+        self,
+        prompt_configuration: PromptConfiguration,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> float:
+        if not minibatch:
+            return 0.0
+
+        scores = [
+            self._score_one(prompt_configuration, golden)
+            for golden in minibatch
+        ]
+        return sum(scores) / len(scores)
+
+    def get_minibatch_feedback(
+        self,
+        prompt_configuration: PromptConfiguration,
+        module: ModuleId,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> str:
+        # default metric feedback (μ_f): concat metric.reason across minibatch and cap length
+        reasons: List[str] = []
+        for golden in minibatch:
+            actual = self.generate(prompt_configuration.prompts, golden)
+            test_case = self._golden_to_test_case(golden, actual)
+            for metric in copy_metrics(self.metrics):
+                _measure_no_indicator(metric=metric, test_case=test_case)
+                if metric.reason:
+                    reasons.append(str(metric.reason))
+        if not reasons:
+            return ""
+        unique: List[str] = []
+        seen = set()
+        for reason in reasons:
+            if reason not in seen:
+                unique.append(reason)
+                seen.add(reason)
+        return "\n---\n".join(
+            unique[:8]
+        )  # TODO: Make how much feedback configurable
+
+    async def a_score_pareto(
+        self,
+        prompt_configuration: PromptConfiguration,
+        d_pareto: Union[List[Golden], List[ConversationalGolden]],
+    ) -> List[float]:
+        tasks = [
+            self._bounded(self._a_score_one(prompt_configuration, golden))
+            for golden in d_pareto
+        ]
+        return await asyncio.gather(*tasks)
+
+    async def a_score_minibatch(
+        self,
+        prompt_configuration: PromptConfiguration,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> float:
+        tasks = [
+            self._bounded(self._a_score_one(prompt_configuration, golden))
+            for golden in minibatch
+        ]
+        scores = await asyncio.gather(*tasks)
+        return sum(scores) / len(scores) if scores else 0.0
+
+    async def a_get_minibatch_feedback(
+        self,
+        prompt_configuration: PromptConfiguration,
+        module: ModuleId,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> str:
+        async def reasons_one(golden) -> List[str]:
+            # Clone per task to avoid shared state
+            metrics = copy_metrics(self.metrics)
+            # metrics = self.metrics
+            actual = await self.a_generate(prompt_configuration.prompts, golden)
+            test_case = self._golden_to_test_case(golden, actual)
+            out: List[str] = []
+            for metric in metrics:
+                await _a_measure_no_indicator(metric, test_case)
+                if metric.reason:
+                    out.append(str(metric.reason))
+            return out
+
+        tasks = [self._bounded(reasons_one(golden)) for golden in minibatch]
+        nested = await asyncio.gather(*tasks)
+        reasons: List[str] = [reason for sub in nested for reason in sub]
+        if not reasons:
+            return ""
+        unique: List[str] = []
+        seen = set()
+        for reason in reasons:
+            if reason not in seen:
+                unique.append(reason)
+                seen.add(reason)
+        return "\n---\n".join(unique[:8])
+
+    ###################
+    # scoring helpers #
+    ###################
+
+    def _golden_to_test_case(
+        self,
+        golden: Union[Golden, ConversationalGolden],
+        actual: str,
+    ) -> Union[LLMTestCase, ConversationalTestCase]:
+        """Convert a golden + actual output into a test case for metrics."""
+        if isinstance(golden, Golden):
+            golden.actual_output = actual
+            return convert_goldens_to_test_cases([golden])[0]
+
+        if isinstance(golden, ConversationalGolden):
+            # Build turns with actual output as assistant response
+            turns: List[Turn] = list(golden.turns or [])
+            if turns and turns[-1].role == "assistant":
+                turns[-1] = Turn(role="assistant", content=actual)
+            elif turns:
+                turns.append(Turn(role="assistant", content=actual))
+            else:
+                turns = [
+                    Turn(role="assistant", content=actual),
+                ]
+
+            golden.turns = turns
+            return convert_convo_goldens_to_convo_test_cases([golden])[0]
+
+    async def _bounded(self, coro):
+        if self._semaphore is None:
+            return await coro
+        async with self._semaphore:
+            res = await coro
+            if self._throttle:
+                await asyncio.sleep(self._throttle)
+            return res
+
+    async def _a_score_one(
+        self,
+        prompt_configuration: PromptConfiguration,
+        golden: Union[Golden, ConversationalGolden],
+    ) -> float:
+        # Clone metrics to avoid shared-state
+        metrics = copy_metrics(self.metrics)
+        actual = await self.a_generate(prompt_configuration.prompts, golden)
+        test_case = self._golden_to_test_case(golden, actual)
+
+        per_metric: Dict[str, float] = {}
+        for metric in metrics:
+            score = await _a_measure_no_indicator(metric, test_case)
+            per_metric[metric.__class__.__name__] = float(score)
+        return self.objective_scalar.scalarize(per_metric)
+
+    def _score_one(
+        self,
+        prompt_configuration: PromptConfiguration,
+        golden: Union[Golden, ConversationalGolden],
+    ) -> float:
+        metrics = copy_metrics(self.metrics)
+        actual = self.generate(prompt_configuration.prompts, golden)
+        test_case = self._golden_to_test_case(golden, actual)
+
+        per_metric: Dict[str, float] = {}
+        for metric in metrics:
+            score = _measure_no_indicator(metric, test_case)
+            per_metric[metric.__class__.__name__] = float(score)
+        return self.objective_scalar.scalarize(per_metric)
+
+    def _select_module_id_from_prompts(
+        self, prompts_by_module: Dict[ModuleId, Prompt]
+    ) -> ModuleId:
+        """
+        Default module selection strategy:
+
+        - Prefer the synthetic '__module__' key when present
+        - Otherwise fall back to the first key in prompts_by_module.
+
+        Assumes `prompts_by_module` is non-empty; callers should validate that.
+        """
+        if self.DEFAULT_MODULE_ID in prompts_by_module:
+            return self.DEFAULT_MODULE_ID
+
+        # At this point we expect at least one key.
+        try:
+            return next(iter(prompts_by_module.keys()))
+        except StopIteration:
+            raise DeepEvalError(
+                "Scorer._select_module_id_from_prompts(...) "
+                "received an empty `prompts_by_module`. At least one Prompt is required."
+            )
+
+    def select_module(
+        self, prompt_configuration: PromptConfiguration
+    ) -> ModuleId:
+        return self._select_module_id_from_prompts(prompt_configuration.prompts)
+
+    async def a_select_module(
+        self, prompt_configuration: PromptConfiguration
+    ) -> ModuleId:
+        return self.select_module(prompt_configuration)
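
The new scorer module above expects a user-supplied model callback (a Prompt and a Golden in, a generated string out, per the ModelCallback alias added in deepeval/optimizer/types.py) plus a list of deepeval metrics. The following is a minimal sketch of how these pieces might be wired together; the callback body and the Prompt alias are hypothetical placeholders and are not part of this diff.

from deepeval.dataset.golden import Golden
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.optimizer.scorer.scorer import Scorer
from deepeval.optimizer.types import PromptConfiguration
from deepeval.prompt.prompt import Prompt


def model_callback(prompt: Prompt, golden: Golden) -> str:
    # Placeholder: call your own LLM with the prompt text and golden.input.
    # The Scorer only requires the (Prompt, Golden) -> str signature.
    return "generated answer for: " + (golden.input or "")


scorer = Scorer(
    model_callback=model_callback,
    metrics=[AnswerRelevancyMetric()],  # any BaseMetric works here
    max_concurrent=5,
    throttle_seconds=0.0,
)

# "__module__" is the synthetic default module id used by Scorer.
config = PromptConfiguration.new(
    prompts={"__module__": Prompt(alias="my-prompt")}  # hypothetical alias
)
avg_score = scorer.score_minibatch(
    config, minibatch=[Golden(input="What does deepeval's Scorer do?")]
)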
deepeval/optimizer/scorer/utils.py
@@ -0,0 +1,30 @@
+import inspect
+from typing import Callable, Union
+
+from deepeval.metrics import BaseConversationalMetric, BaseMetric
+from deepeval.test_case import ConversationalTestCase, LLMTestCase
+
+
+def _build_measure_kwargs(func: Callable) -> dict:
+    params = inspect.signature(func).parameters
+    kwargs = {}
+    for key in ("_show_indicator", "_in_component", "_log_metric_to_confident"):
+        if key in params:
+            kwargs[key] = False
+    return kwargs
+
+
+def _measure_no_indicator(
+    metric: Union[BaseMetric, BaseConversationalMetric],
+    test_case: Union[LLMTestCase, ConversationalTestCase],
+):
+    kwargs = _build_measure_kwargs(metric.measure)
+    return metric.measure(test_case, **kwargs)
+
+
+async def _a_measure_no_indicator(
+    metric: Union[BaseMetric, BaseConversationalMetric],
+    test_case: Union[LLMTestCase, ConversationalTestCase],
+):
+    kwargs = _build_measure_kwargs(metric.a_measure)
+    return await metric.a_measure(test_case, **kwargs)
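
These helpers probe a metric's measure/a_measure signature and disable the progress-indicator, component, and Confident-logging flags only when the metric actually accepts them. A small illustration of the kwargs-building behavior, using a hypothetical metric-like object that is not part of the diff:

from deepeval.optimizer.scorer.utils import _build_measure_kwargs


class FakeMetric:
    # Only _show_indicator is accepted here, so only that flag gets disabled.
    def measure(self, test_case, _show_indicator: bool = True) -> float:
        return 1.0


print(_build_measure_kwargs(FakeMetric().measure))
# {'_show_indicator': False} -- unsupported flags are simply omitted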
deepeval/optimizer/types.py
@@ -0,0 +1,148 @@
+from __future__ import annotations
+import uuid
+from abc import ABC, abstractmethod
+
+from dataclasses import dataclass
+from typing import (
+    Callable,
+    Dict,
+    List,
+    Optional,
+    TypedDict,
+    TYPE_CHECKING,
+    Union,
+)
+from enum import Enum
+from pydantic import BaseModel, ConfigDict
+
+from deepeval.prompt.prompt import Prompt
+
+if TYPE_CHECKING:
+    from deepeval.dataset.golden import Golden, ConversationalGolden
+
+PromptConfigurationId = str
+ModuleId = str
+ScoreVector = List[float]  # scores per instance on D_pareto, aligned order
+ScoreTable = Dict[PromptConfigurationId, ScoreVector]
+
+# Type alias for model callback function
+ModelCallback = Callable[[Prompt, Union["Golden", "ConversationalGolden"]], str]
+
+
+@dataclass
+class PromptConfiguration:
+    id: PromptConfigurationId
+    parent: Optional[PromptConfigurationId]
+    prompts: Dict[ModuleId, Prompt]
+
+    @staticmethod
+    def new(
+        prompts: Dict[ModuleId, Prompt],
+        parent: Optional[PromptConfigurationId] = None,
+    ) -> "PromptConfiguration":
+        return PromptConfiguration(
+            id=str(uuid.uuid4()), parent=parent, prompts=dict(prompts)
+        )
+
+
+class RunnerStatusType(str, Enum):
+    """Status events emitted by optimization runners."""
+
+    PROGRESS = "progress"
+    TIE = "tie"
+    ERROR = "error"
+
+
+# Type alias for status callback function
+RunnerStatusCallback = Callable[..., None]
+
+
+class Objective(ABC):
+    """Strategy for reducing scores per-metric to a single scalar value.
+
+    Implementations receive a mapping from metric name to score
+    (for example, {"AnswerRelevancyMetric": 0.82}) and return a
+    single float used for comparisons inside the optimizer.
+    """
+
+    @abstractmethod
+    def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
+        raise NotImplementedError
+
+
+class MeanObjective(Objective):
+    """Default scalarizer: unweighted arithmetic mean.
+
+    - If `scores_by_metric` is non-empty, returns the arithmetic
+      mean of all metric scores.
+    - If `scores_by_metric` is empty, returns 0.0.
+    """
+
+    def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
+        if not scores_by_metric:
+            return 0.0
+        return sum(scores_by_metric.values()) / len(scores_by_metric)
+
+
+class WeightedObjective(Objective):
+    """
+    Objective that scales each metric's score by a user-provided weight and sums them.
+
+    - `weights_by_metric` keys should match the class names of the metrics
+      passed to the PromptOptimizer.
+    - Metrics not present in `weights_by_metric` receive `default_weight`.
+      This makes it easy to emphasize a subset of metrics while keeping
+      everything else at a baseline weight of 1.0, e.g.:
+
+          WeightedObjective({"AnswerRelevancyMetric": 2.0})
+
+      which treats AnswerRelevancy as 2x as important as the other metrics.
+    """
+
+    def __init__(
+        self,
+        weights_by_metric: Optional[Dict[str, float]] = None,
+        default_weight: float = 1.0,
+    ):
+        self.weights_by_metric: Dict[str, float] = dict(weights_by_metric or {})
+        self.default_weight: float = float(default_weight)
+
+    def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
+        return sum(
+            self.weights_by_metric.get(name, self.default_weight) * score
+            for name, score in scores_by_metric.items()
+        )
+
+
+class AcceptedIterationDict(TypedDict):
+    parent: PromptConfigurationId
+    child: PromptConfigurationId
+    module: ModuleId
+    before: float
+    after: float
+
+
+class AcceptedIteration(BaseModel):
+    parent: str
+    child: str
+    module: str
+    before: float
+    after: float
+
+
+class PromptConfigSnapshot(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    parent: Optional[str]
+    prompts: Dict[str, Prompt]
+
+
+class OptimizationReport(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    optimization_id: str
+    best_id: str
+    accepted_iterations: List[AcceptedIteration]
+    pareto_scores: Dict[str, List[float]]
+    parents: Dict[str, Optional[str]]
+    prompt_configurations: Dict[str, PromptConfigSnapshot]
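
The Objective classes above reduce a per-metric score dictionary to one scalar that the optimizer compares across prompt configurations. A short usage sketch based on the classes added in this hunk; the metric names and scores are illustrative, not taken from the diff.

from deepeval.optimizer.types import MeanObjective, WeightedObjective

scores = {"AnswerRelevancyMetric": 0.8, "FaithfulnessMetric": 0.6}

# Unweighted arithmetic mean: (0.8 + 0.6) / 2 = 0.7
print(MeanObjective().scalarize(scores))

# Weighted sum: 2.0 * 0.8 + 1.0 * 0.6 = 2.2 (a sum, not a normalized mean)
print(WeightedObjective({"AnswerRelevancyMetric": 2.0}).scalarize(scores))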