deepeval-3.7.4-py3-none-any.whl → deepeval-3.7.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/optimization/types.py
@@ -1,361 +0,0 @@
-from __future__ import annotations
-import uuid
-
-from dataclasses import dataclass
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    List,
-    Literal,
-    Optional,
-    Protocol,
-    TYPE_CHECKING,
-    TypedDict,
-    Tuple,
-    Union,
-)
-from enum import Enum
-from pydantic import BaseModel as PydanticBaseModel, Field, AliasChoices
-
-from deepeval.prompt.prompt import Prompt
-from deepeval.models.base_model import DeepEvalBaseLLM
-
-
-if TYPE_CHECKING:
-    from deepeval.dataset.golden import Golden, ConversationalGolden
-
-PromptConfigurationId = str
-ModuleId = str
-ScoreVector = List[float]  # scores per instance on D_pareto, aligned order
-ScoreTable = Dict[PromptConfigurationId, ScoreVector]
-
-
-@dataclass
-class PromptConfiguration:
-    id: PromptConfigurationId
-    parent: Optional[PromptConfigurationId]
-    prompts: Dict[ModuleId, Prompt]
-
-    @staticmethod
-    def new(
-        prompts: Dict[ModuleId, Prompt],
-        parent: Optional[PromptConfigurationId] = None,
-    ) -> "PromptConfiguration":
-        return PromptConfiguration(
-            id=str(uuid.uuid4()), parent=parent, prompts=dict(prompts)
-        )
-
-
-class ScoringAdapter(Protocol):
-    """
-    Scoring adapter contract used by optimization runners.
-
-    Runners call into this adapter to:
-    - compute scores per-instance on some subset (score_on_pareto),
-    - compute minibatch means for selection and acceptance,
-    - generate feedback text used by the PromptRewriter.
-    """
-
-    # Sync
-    def score_on_pareto(
-        self,
-        prompt_configuration: PromptConfiguration,
-        d_pareto: Union[List[Golden], List[ConversationalGolden]],
-    ) -> ScoreVector:
-        """Return per-instance scores on D_pareto."""
-        ...
-
-    def minibatch_score(
-        self,
-        prompt_configuration: PromptConfiguration,
-        minibatch: Union[List[Golden], List[ConversationalGolden]],
-    ) -> float:
-        """Return average score μ on a minibatch from D_feedback."""
-        ...
-
-    def minibatch_feedback(
-        self,
-        prompt_configuration: PromptConfiguration,
-        module: ModuleId,
-        minibatch: Union[List[Golden], List[ConversationalGolden]],
-    ) -> str:
-        """Return μ_f text for the module (metric.reason + traces, etc.)."""
-        ...
-
-    def select_module(
-        self, prompt_configuration: PromptConfiguration
-    ) -> ModuleId:
-        """Pick a module to mutate."""
-        ...
-
-    # Async
-    async def a_score_on_pareto(
-        self,
-        prompt_configuration: PromptConfiguration,
-        d_pareto: Union[List[Golden], List[ConversationalGolden]],
-    ) -> ScoreVector: ...
-    async def a_minibatch_score(
-        self,
-        prompt_configuration: PromptConfiguration,
-        minibatch: Union[List[Golden], List[ConversationalGolden]],
-    ) -> float: ...
-    async def a_minibatch_feedback(
-        self,
-        prompt_configuration: PromptConfiguration,
-        module: ModuleId,
-        minibatch: Union[List[Golden], List[ConversationalGolden]],
-    ) -> str: ...
-    async def a_select_module(
-        self, prompt_configuration: PromptConfiguration
-    ) -> ModuleId: ...
-
-
-class PromptRewriterProtocol(Protocol):
-    def rewrite(
-        self,
-        *,
-        module_id: ModuleId,
-        model: Optional[DeepEvalBaseLLM] = None,
-        model_schema: Optional[PydanticBaseModel] = None,
-        model_callback: Optional[
-            Callable[
-                ...,
-                Union[
-                    str,
-                    Dict,
-                    Tuple[Union[str, Dict], float],
-                ],
-            ]
-        ] = None,
-        old_prompt: Prompt,
-        feedback_text: str,
-    ) -> Prompt: ...
-
-    async def a_rewrite(
-        self,
-        *,
-        module_id: ModuleId,
-        model: Optional[DeepEvalBaseLLM] = None,
-        model_schema: Optional[PydanticBaseModel] = None,
-        model_callback: Optional[
-            Callable[
-                ...,
-                Union[
-                    str,
-                    Dict,
-                    Tuple[Union[str, Dict], float],
-                ],
-            ]
-        ] = None,
-        old_prompt: Prompt,
-        feedback_text: str,
-    ) -> Prompt: ...
-
-
-class RunnerStatusType(str, Enum):
-    """Status events emitted by optimization runners."""
-
-    PROGRESS = "progress"
-    TIE = "tie"
-    ERROR = "error"
-
-
-class RunnerStatusCallbackProtocol(Protocol):
-    def __call__(
-        self,
-        kind: RunnerStatusType,
-        *,
-        detail: str,
-        step_index: Optional[int] = None,
-        total_steps: Optional[int] = None,
-    ) -> None: ...
-
-
-class RunnerProtocol(Protocol):
-    """
-    Contract for prompt optimization runners used by PromptOptimizer.
-
-    Runners are responsible for executing the optimization algorithm
-    and returning an optimized Prompt plus a report dict.
-    """
-
-    # status_callback is injected by PromptOptimizer
-    # A runner may call this to report:
-    # progress, ties, or errors during execution.
-    status_callback: Optional[RunnerStatusCallbackProtocol]
-    model_callback: Optional[
-        Callable[
-            ...,
-            Union[
-                str,
-                Dict,
-                Tuple[Union[str, Dict], float],
-            ],
-        ]
-    ]
-
-    scoring_adapter: Optional[ScoringAdapter]
-
-    def execute(
-        self,
-        *,
-        prompt: Prompt,
-        goldens: Union[List["Golden"], List["ConversationalGolden"]],
-    ) -> Tuple[Prompt, Dict]: ...
-
-    async def a_execute(
-        self,
-        *,
-        prompt: Prompt,
-        goldens: Union[List["Golden"], List["ConversationalGolden"]],
-    ) -> Tuple[Prompt, Dict]: ...
-
-
-class Objective(Protocol):
-    """Strategy for reducing scores per-metric to a single scalar value.
-
-    Implementations receive a mapping from metric name to score
-    (for example, {"AnswerRelevancyMetric": 0.82}) and return a
-    single float used for comparisons inside the optimizer.
-    """
-
-    def scalarize(self, scores_by_metric: Dict[str, float]) -> float: ...
-
-
-class MeanObjective(Objective):
-    """Default scalarizer: unweighted arithmetic mean.
-
-    - If `scores_by_metric` is non-empty, returns the arithmetic
-      mean of all metric scores.
-    - If `scores_by_metric` is empty, returns 0.0.
-    """
-
-    def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
-        if not scores_by_metric:
-            return 0.0
-        return sum(scores_by_metric.values()) / len(scores_by_metric)
-
-
-class WeightedObjective(Objective):
-    """
-    Objective that scales each metric's score by a user-provided weight and sums them.
-
-    - `weights_by_metric` keys should match the names of the metrics passed to the
-      metric class names passed to the PromptOptimizer.
-    - Metrics not present in `weights_by_metric` receive `default_weight`.
-      This makes it easy to emphasize a subset of metrics while keeping
-      everything else at a baseline weight of 1.0, e.g.:
-
-        WeightedObjective({"AnswerRelevancyMetric": 2.0})
-
-    which treats AnswerRelevancy as 2x as important as the other metrics.
-    """
-
-    def __init__(
-        self,
-        weights_by_metric: Optional[Dict[str, float]] = None,
-        default_weight: float = 1.0,
-    ):
-        self.weights_by_metric: Dict[str, float] = dict(weights_by_metric or {})
-        self.default_weight: float = float(default_weight)
-
-    def scalarize(self, scores_by_metric: Dict[str, float]) -> float:
-        return sum(
-            self.weights_by_metric.get(name, self.default_weight) * score
-            for name, score in scores_by_metric.items()
-        )
-
-
-@dataclass
-class MetricInfo:
-    name: str
-    rubric: Optional[str] = None
-
-
-class AcceptedIterationDict(TypedDict):
-    parent: PromptConfigurationId
-    child: PromptConfigurationId
-    module: ModuleId
-    before: float
-    after: float
-
-
-class AcceptedIteration(PydanticBaseModel):
-    parent: str
-    child: str
-    module: str
-    before: float
-    after: float
-
-
-class PromptMessageSnapshot(PydanticBaseModel):
-    role: str
-    content: str
-
-
-class PromptModuleSnapshot(PydanticBaseModel):
-    type: Literal["TEXT", "LIST"]
-    # Only used when type == "TEXT"
-    text_template: Optional[str] = None
-    # Only used when type == "LIST"
-    messages: Optional[List[PromptMessageSnapshot]] = None
-
-
-class PromptConfigSnapshot(PydanticBaseModel):
-    parent: Optional[str]
-    prompts: Dict[str, PromptModuleSnapshot]
-
-
-@dataclass
-class OptimizationResult:
-    optimization_id: str
-    best_id: PromptConfigurationId
-    accepted_iterations: List[Dict]
-    pareto_scores: Dict[PromptConfigurationId, List[float]]
-    parents: Dict[PromptConfigurationId, Optional[PromptConfigurationId]]
-    prompt_configurations: Dict[PromptConfigurationId, Dict[str, Any]]
-
-    def as_dict(self) -> Dict:
-        return dict(
-            optimization_id=self.optimization_id,
-            best_id=self.best_id,
-            accepted_iterations=self.accepted_iterations,
-            pareto_scores=self.pareto_scores,
-            parents=self.parents,
-            prompt_configurations=self.prompt_configurations,
-        )
-
-
-class OptimizationReport(PydanticBaseModel):
-    optimization_id: str = Field(
-        alias="optimizationId",
-        validation_alias=AliasChoices("optimizationId", "optimization_id"),
-    )
-    best_id: str = Field(
-        alias="bestId",
-        validation_alias=AliasChoices("bestId", "best_id"),
-    )
-    accepted_iterations: list[AcceptedIteration] = Field(
-        default_factory=list,
-        alias="acceptedIterations",
-        validation_alias=AliasChoices(
-            "acceptedIterations", "accepted_iterations"
-        ),
-    )
-    pareto_scores: dict[str, list[float]] = Field(
-        alias="paretoScores",
-        validation_alias=AliasChoices("paretoScores", "pareto_scores"),
-    )
-    parents: dict[str, str | None]
-    prompt_configurations: dict[str, PromptConfigSnapshot] = Field(
-        alias="promptConfigurations",
-        validation_alias=AliasChoices(
-            "promptConfigurations", "prompt_configurations"
-        ),
-    )
-
-    @classmethod
-    def from_runtime(cls, result: dict) -> "OptimizationReport":
-        # accepts the dict from OptimizationResult.as_dict()
-        return cls(**result)
deepeval/test_case/mllm_test_case.py
@@ -1,170 +0,0 @@
-from typing import List, Optional, Dict, Union
-from urllib.parse import urlparse, unquote
-from dataclasses import dataclass, field
-from enum import Enum
-import mimetypes
-import base64
-import os
-
-from deepeval.test_case import ToolCall
-
-
-@dataclass
-class MLLMImage:
-    dataBase64: Optional[str] = None
-    mimeType: Optional[str] = None
-    url: Optional[str] = None
-    local: Optional[bool] = None
-    filename: Optional[str] = None
-
-    def __post_init__(self):
-
-        if self.url and self.dataBase64:
-            raise ValueError(
-                "You cannot provide both 'url' and 'dataBase64' at the same time when creating an MLLMImage."
-            )
-
-        if not self.url and not self.dataBase64:
-            raise ValueError(
-                "You must provide either a 'url' or both 'dataBase64' and 'mimeType' to create an MLLMImage."
-            )
-
-        if self.dataBase64 is not None:
-            if self.mimeType is None:
-                raise ValueError(
-                    "mimeType must be provided when initializing from Base64 data."
-                )
-        else:
-            is_local = self.is_local_path(self.url)
-            if self.local is not None:
-                assert self.local == is_local, "Local path mismatch"
-            else:
-                self.local = is_local
-
-            # compute filename, mime_type, and Base64 data
-            if self.local:
-                path = self.process_url(self.url)
-                self.filename = os.path.basename(path)
-                self.mimeType = (
-                    mimetypes.guess_type(path)[0] or "application/octet-stream"
-                )
-                with open(path, "rb") as f:
-                    raw = f.read()
-                self.dataBase64 = base64.b64encode(raw).decode("ascii")
-            else:
-                self.filename = None
-                self.mimeType = None
-                self.dataBase64 = None
-
-    @staticmethod
-    def process_url(url: str) -> str:
-        if os.path.exists(url):
-            return url
-        parsed = urlparse(url)
-        if parsed.scheme == "file":
-            raw_path = (
-                f"//{parsed.netloc}{parsed.path}"
-                if parsed.netloc
-                else parsed.path
-            )
-            path = unquote(raw_path)
-            return path
-        return url
-
-    @staticmethod
-    def is_local_path(url: str) -> bool:
-        if os.path.exists(url):
-            return True
-        parsed = urlparse(url)
-        if parsed.scheme == "file":
-            raw_path = (
-                f"//{parsed.netloc}{parsed.path}"
-                if parsed.netloc
-                else parsed.path
-            )
-            path = unquote(raw_path)
-            return os.path.exists(path)
-        return False
-
-    def as_data_uri(self) -> Optional[str]:
-        """Return the image as a data URI string, if Base64 data is available."""
-        if not self.dataBase64 or not self.mimeType:
-            return None
-        return f"data:{self.mimeType};base64,{self.dataBase64}"
-
-
-class MLLMTestCaseParams(Enum):
-    INPUT = "input"
-    ACTUAL_OUTPUT = "actual_output"
-    EXPECTED_OUTPUT = "expected_output"
-    CONTEXT = "context"
-    RETRIEVAL_CONTEXT = "retrieval_context"
-    TOOLS_CALLED = "tools_called"
-    EXPECTED_TOOLS = "expected_tools"
-
-
-@dataclass
-class MLLMTestCase:
-    input: List[Union[str, MLLMImage]]
-    actual_output: List[Union[str, MLLMImage]]
-    expected_output: Optional[List[Union[str, MLLMImage]]] = None
-    context: Optional[List[Union[str, MLLMImage]]] = None
-    retrieval_context: Optional[List[Union[str, MLLMImage]]] = None
-    additional_metadata: Optional[Dict] = None
-    comments: Optional[str] = None
-    tools_called: Optional[List[ToolCall]] = None
-    expected_tools: Optional[List[ToolCall]] = None
-    token_cost: Optional[float] = None
-    completion_time: Optional[float] = None
-    name: Optional[str] = field(default=None)
-    _dataset_rank: Optional[int] = field(default=None, repr=False)
-    _dataset_alias: Optional[str] = field(default=None, repr=False)
-    _dataset_id: Optional[str] = field(default=None, repr=False)
-
-    def __post_init__(self):
-        # Ensure `expected_output` is None or a list of strings or MLLMImage instances
-        if self.expected_output is not None:
-            if not isinstance(self.expected_output, list) or not all(
-                isinstance(item, (str, MLLMImage))
-                for item in self.expected_output
-            ):
-                raise TypeError(
-                    "'expected_output' must be None or a list of strings or MLLMImage instances"
-                )
-
-        # Ensure `context` is None or a list of strings or MLLMImage instances
-        if self.context is not None:
-            if not isinstance(self.context, list) or not all(
-                isinstance(item, (str, MLLMImage)) for item in self.context
-            ):
-                raise TypeError(
-                    "'context' must be None or a list of strings or MLLMImage instances"
-                )
-
-        # Ensure `retrieval_context` is None or a list of strings or MLLMImage instances
-        if self.retrieval_context is not None:
-            if not isinstance(self.retrieval_context, list) or not all(
-                isinstance(item, (str, MLLMImage))
-                for item in self.retrieval_context
-            ):
-                raise TypeError(
-                    "'retrieval_context' must be None or a list of strings or MLLMImage instances"
-                )
-
-        # Ensure `tools_called` is None or a list of strings
-        if self.tools_called is not None:
-            if not isinstance(self.tools_called, list) or not all(
-                isinstance(item, ToolCall) for item in self.tools_called
-            ):
-                raise TypeError(
-                    "'tools_called' must be None or a list of `ToolCall`"
-                )
-
-        # Ensure `expected_tools` is None or a list of strings
-        if self.expected_tools is not None:
-            if not isinstance(self.expected_tools, list) or not all(
-                isinstance(item, ToolCall) for item in self.expected_tools
-            ):
-                raise TypeError(
-                    "'expected_tools' must be None or a list of `ToolCall`"
-                )