deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. deepeval/_version.py +1 -1
  2. deepeval/dataset/golden.py +54 -2
  3. deepeval/evaluate/evaluate.py +16 -8
  4. deepeval/evaluate/execute.py +70 -26
  5. deepeval/evaluate/utils.py +26 -22
  6. deepeval/integrations/pydantic_ai/agent.py +19 -2
  7. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  8. deepeval/metrics/__init__.py +14 -12
  9. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  10. deepeval/metrics/answer_relevancy/template.py +188 -92
  11. deepeval/metrics/base_metric.py +2 -5
  12. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  13. deepeval/metrics/contextual_precision/template.py +115 -66
  14. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  15. deepeval/metrics/contextual_recall/template.py +106 -55
  16. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  17. deepeval/metrics/contextual_relevancy/template.py +87 -58
  18. deepeval/metrics/dag/templates.py +2 -2
  19. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  20. deepeval/metrics/faithfulness/schema.py +1 -1
  21. deepeval/metrics/faithfulness/template.py +200 -115
  22. deepeval/metrics/g_eval/utils.py +2 -2
  23. deepeval/metrics/indicator.py +4 -4
  24. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  25. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  26. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  27. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  28. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  29. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  31. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  32. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  33. deepeval/metrics/ragas.py +3 -3
  34. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  35. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  36. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  37. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  38. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  39. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  40. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  41. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  42. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  43. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  44. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  45. deepeval/metrics/turn_faithfulness/template.py +218 -0
  46. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  47. deepeval/metrics/utils.py +39 -58
  48. deepeval/models/__init__.py +0 -12
  49. deepeval/models/base_model.py +16 -38
  50. deepeval/models/embedding_models/__init__.py +7 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +52 -28
  52. deepeval/models/embedding_models/local_embedding_model.py +18 -14
  53. deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
  54. deepeval/models/embedding_models/openai_embedding_model.py +40 -21
  55. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  56. deepeval/models/llms/anthropic_model.py +44 -23
  57. deepeval/models/llms/azure_model.py +121 -36
  58. deepeval/models/llms/deepseek_model.py +18 -13
  59. deepeval/models/llms/gemini_model.py +129 -43
  60. deepeval/models/llms/grok_model.py +18 -13
  61. deepeval/models/llms/kimi_model.py +18 -13
  62. deepeval/models/llms/litellm_model.py +42 -22
  63. deepeval/models/llms/local_model.py +12 -7
  64. deepeval/models/llms/ollama_model.py +114 -12
  65. deepeval/models/llms/openai_model.py +137 -41
  66. deepeval/models/llms/portkey_model.py +24 -7
  67. deepeval/models/llms/utils.py +5 -3
  68. deepeval/models/retry_policy.py +17 -14
  69. deepeval/models/utils.py +46 -1
  70. deepeval/optimizer/__init__.py +5 -0
  71. deepeval/optimizer/algorithms/__init__.py +6 -0
  72. deepeval/optimizer/algorithms/base.py +29 -0
  73. deepeval/optimizer/algorithms/configs.py +18 -0
  74. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  75. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  76. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  77. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  78. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  79. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  80. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  81. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  82. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  83. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  84. deepeval/{optimization → optimizer}/configs.py +5 -8
  85. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  86. deepeval/optimizer/prompt_optimizer.py +263 -0
  87. deepeval/optimizer/rewriter/__init__.py +5 -0
  88. deepeval/optimizer/rewriter/rewriter.py +124 -0
  89. deepeval/optimizer/rewriter/utils.py +214 -0
  90. deepeval/optimizer/scorer/__init__.py +5 -0
  91. deepeval/optimizer/scorer/base.py +86 -0
  92. deepeval/optimizer/scorer/scorer.py +316 -0
  93. deepeval/optimizer/scorer/utils.py +30 -0
  94. deepeval/optimizer/types.py +148 -0
  95. deepeval/{optimization → optimizer}/utils.py +47 -165
  96. deepeval/prompt/prompt.py +5 -9
  97. deepeval/test_case/__init__.py +1 -3
  98. deepeval/test_case/api.py +12 -10
  99. deepeval/test_case/conversational_test_case.py +19 -1
  100. deepeval/test_case/llm_test_case.py +152 -1
  101. deepeval/test_case/utils.py +4 -8
  102. deepeval/test_run/api.py +15 -14
  103. deepeval/test_run/test_run.py +3 -3
  104. deepeval/tracing/patchers.py +9 -4
  105. deepeval/tracing/tracing.py +2 -2
  106. deepeval/utils.py +65 -0
  107. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  108. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
  109. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  110. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  111. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  112. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  113. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  114. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  115. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  116. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  117. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  118. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  119. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  120. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  121. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  122. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  123. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  124. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  125. deepeval/models/mlllms/__init__.py +0 -4
  126. deepeval/models/mlllms/azure_model.py +0 -343
  127. deepeval/models/mlllms/gemini_model.py +0 -313
  128. deepeval/models/mlllms/ollama_model.py +0 -175
  129. deepeval/models/mlllms/openai_model.py +0 -309
  130. deepeval/optimization/__init__.py +0 -13
  131. deepeval/optimization/adapters/__init__.py +0 -2
  132. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  133. deepeval/optimization/aggregates.py +0 -14
  134. deepeval/optimization/copro/configs.py +0 -31
  135. deepeval/optimization/gepa/__init__.py +0 -7
  136. deepeval/optimization/gepa/configs.py +0 -115
  137. deepeval/optimization/miprov2/configs.py +0 -134
  138. deepeval/optimization/miprov2/loop.py +0 -785
  139. deepeval/optimization/mutations/__init__.py +0 -0
  140. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  141. deepeval/optimization/policies/__init__.py +0 -16
  142. deepeval/optimization/policies/tie_breaker.py +0 -67
  143. deepeval/optimization/prompt_optimizer.py +0 -462
  144. deepeval/optimization/simba/__init__.py +0 -0
  145. deepeval/optimization/simba/configs.py +0 -33
  146. deepeval/optimization/types.py +0 -361
  147. deepeval/test_case/mllm_test_case.py +0 -170
  148. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  149. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  150. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  152. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  153. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  154. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  155. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
@@ -1,309 +0,0 @@
1
- import base64
2
- from typing import Optional, Tuple, List, Union, Dict
3
- from openai import OpenAI, AsyncOpenAI
4
- from openai.types.chat import ParsedChatCompletion
5
- from pydantic import BaseModel, SecretStr
6
- from io import BytesIO
7
-
8
- from deepeval.config.settings import get_settings
9
- from deepeval.models.llms.openai_model import (
10
- model_pricing,
11
- structured_outputs_models,
12
- _request_timeout_seconds,
13
- )
14
- from deepeval.models import DeepEvalBaseMLLM
15
- from deepeval.models.llms.utils import trim_and_load_json
16
- from deepeval.test_case import MLLMImage
17
- from deepeval.models.utils import parse_model_name, require_secret_api_key
18
- from deepeval.models.retry_policy import (
19
- create_retry_decorator,
20
- sdk_retries_for,
21
- )
22
- from deepeval.constants import ProviderSlug as PS
23
-
24
-
25
- retry_openai = create_retry_decorator(PS.OPENAI)
26
-
27
- valid_multimodal_gpt_models = [
28
- "gpt-4o",
29
- "gpt-4o-2024-05-13",
30
- "gpt-4o-2024-08-06",
31
- "gpt-4o-2024-11-20",
32
- "gpt-4o-mini",
33
- "gpt-4o-mini-2024-07-18",
34
- "gpt-4.1",
35
- "gpt-4.1-mini",
36
- "gpt-4.1-nano",
37
- "o1",
38
- "o1-preview",
39
- "o1-2024-12-17",
40
- "o1-preview-2024-09-12",
41
- "gpt-4.5-preview-2025-02-27",
42
- "o4-mini",
43
- ]
44
-
45
- default_multimodal_gpt_model = "gpt-4.1"
46
-
47
- unsupported_log_probs_multimodal_gpt_models = [
48
- "o1",
49
- "o1-preview",
50
- "o1-2024-12-17",
51
- "o1-preview-2024-09-12",
52
- "gpt-4.5-preview-2025-02-27",
53
- "o4-mini",
54
- ]
55
-
56
-
57
- class MultimodalOpenAIModel(DeepEvalBaseMLLM):
58
- def __init__(
59
- self,
60
- model: Optional[str] = None,
61
- _openai_api_key: Optional[str] = None,
62
- *args,
63
- **kwargs,
64
- ):
65
- settings = get_settings()
66
- model_name = None
67
- if isinstance(model, str):
68
- model_name = parse_model_name(model)
69
- if model_name not in valid_multimodal_gpt_models:
70
- raise ValueError(
71
- f"Invalid model. Available Multimodal GPT models: "
72
- f"{', '.join(model for model in valid_multimodal_gpt_models)}"
73
- )
74
- elif settings.OPENAI_MODEL_NAME is not None:
75
- model_name = settings.OPENAI_MODEL_NAME
76
- elif model is None:
77
- model_name = default_multimodal_gpt_model
78
-
79
- if _openai_api_key is not None:
80
- # keep it secret, keep it safe from serializings, logging and aolike
81
- self._openai_api_key: SecretStr | None = SecretStr(_openai_api_key)
82
- else:
83
- self._openai_api_key = settings.OPENAI_API_KEY
84
-
85
- self.args = args
86
- self.kwargs = kwargs
87
-
88
- super().__init__(model_name, *args, **kwargs)
89
-
90
- ###############################################
91
- # Generate functions
92
- ###############################################
93
-
94
- @retry_openai
95
- def generate(
96
- self,
97
- multimodal_input: List[Union[str, MLLMImage]],
98
- schema: Optional[BaseModel] = None,
99
- ) -> Tuple[str, float]:
100
- client = self.load_model(async_mode=False)
101
- prompt = self.generate_prompt(multimodal_input)
102
-
103
- if schema:
104
- if self.model_name in structured_outputs_models:
105
- messages = [{"role": "user", "content": prompt}]
106
- response = client.beta.chat.completions.parse(
107
- model=self.model_name,
108
- messages=messages,
109
- response_format=schema,
110
- )
111
- input_tokens = response.usage.prompt_tokens
112
- output_tokens = response.usage.completion_tokens
113
- total_cost = self.calculate_cost(input_tokens, output_tokens)
114
- generated_text = response.choices[0].message.parsed
115
- return generated_text, total_cost
116
-
117
- completion = client.chat.completions.create(
118
- model=self.model_name,
119
- messages=[{"role": "user", "content": prompt}],
120
- )
121
- output = completion.choices[0].message.content
122
- cost = self.calculate_cost(
123
- completion.usage.prompt_tokens, completion.usage.completion_tokens
124
- )
125
- if schema:
126
- json_output = trim_and_load_json(output)
127
- return schema.model_validate(json_output), cost
128
- else:
129
- return output, cost
130
-
131
- @retry_openai
132
- async def a_generate(
133
- self,
134
- multimodal_input: List[Union[str, MLLMImage]],
135
- schema: Optional[BaseModel] = None,
136
- ) -> Tuple[str, float]:
137
- client = self.load_model(async_mode=True)
138
- prompt = self.generate_prompt(multimodal_input)
139
-
140
- if schema:
141
- if self.model_name in structured_outputs_models:
142
- messages = [{"role": "user", "content": prompt}]
143
- response = await client.beta.chat.completions.parse(
144
- model=self.model_name,
145
- messages=messages,
146
- response_format=schema,
147
- )
148
- input_tokens = response.usage.prompt_tokens
149
- output_tokens = response.usage.completion_tokens
150
- total_cost = self.calculate_cost(input_tokens, output_tokens)
151
- generated_text = response.choices[0].message.parsed
152
- return generated_text, total_cost
153
-
154
- completion = await client.chat.completions.create(
155
- model=self.model_name,
156
- messages=[{"role": "user", "content": prompt}],
157
- )
158
- output = completion.choices[0].message.content
159
- cost = self.calculate_cost(
160
- completion.usage.prompt_tokens, completion.usage.completion_tokens
161
- )
162
- if schema:
163
- json_output = trim_and_load_json(output)
164
- return schema.model_validate(json_output), cost
165
- else:
166
- return output, cost
167
-
168
- ###############################################
169
- # Other generate functions
170
- ###############################################
171
-
172
- @retry_openai
173
- def generate_raw_response(
174
- self,
175
- multimodal_input: List[Union[str, MLLMImage]],
176
- top_logprobs: int = 5,
177
- ) -> Tuple[ParsedChatCompletion, float]:
178
- client = self._client()
179
- prompt = self.generate_prompt(multimodal_input)
180
- messages = [{"role": "user", "content": prompt}]
181
- completion = client.chat.completions.create(
182
- model=self.model_name,
183
- messages=messages,
184
- logprobs=True,
185
- top_logprobs=top_logprobs,
186
- )
187
- # Cost calculation
188
- input_tokens = completion.usage.prompt_tokens
189
- output_tokens = completion.usage.completion_tokens
190
- cost = self.calculate_cost(input_tokens, output_tokens)
191
- return completion, cost
192
-
193
- @retry_openai
194
- async def a_generate_raw_response(
195
- self,
196
- multimodal_input: List[Union[str, MLLMImage]],
197
- top_logprobs: int = 5,
198
- ) -> Tuple[ParsedChatCompletion, float]:
199
- client = self._client(async_mode=True)
200
- prompt = self.generate_prompt(multimodal_input)
201
- messages = [{"role": "user", "content": prompt}]
202
- completion = await client.chat.completions.create(
203
- model=self.model_name,
204
- messages=messages,
205
- logprobs=True,
206
- top_logprobs=top_logprobs,
207
- )
208
- # Cost calculation
209
- input_tokens = completion.usage.prompt_tokens
210
- output_tokens = completion.usage.completion_tokens
211
- cost = self.calculate_cost(input_tokens, output_tokens)
212
- return completion, cost
213
-
214
- ###############################################
215
- # Utilities
216
- ###############################################
217
-
218
- def generate_prompt(
219
- self, multimodal_input: List[Union[str, MLLMImage]] = []
220
- ):
221
- prompt = []
222
- for ele in multimodal_input:
223
- if isinstance(ele, str):
224
- prompt.append({"type": "text", "text": ele})
225
- elif isinstance(ele, MLLMImage):
226
- if ele.local:
227
- import PIL.Image
228
-
229
- image = PIL.Image.open(ele.url)
230
- visual_dict = {
231
- "type": "image_url",
232
- "image_url": {
233
- "url": f"data:image/jpeg;base64,{self.encode_pil_image(image)}"
234
- },
235
- }
236
- else:
237
- visual_dict = {
238
- "type": "image_url",
239
- "image_url": {"url": ele.url},
240
- }
241
- prompt.append(visual_dict)
242
- return prompt
243
-
244
- def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
245
- pricing = model_pricing.get(
246
- self.model_name, model_pricing["gpt-4.1"]
247
- ) # Default to 'gpt-4.1' if model not found
248
- input_cost = input_tokens * pricing["input"]
249
- output_cost = output_tokens * pricing["output"]
250
- return input_cost + output_cost
251
-
252
- def encode_pil_image(self, pil_image):
253
- image_buffer = BytesIO()
254
- if pil_image.mode in ("RGBA", "LA", "P"):
255
- pil_image = pil_image.convert("RGB")
256
- pil_image.save(image_buffer, format="JPEG")
257
- image_bytes = image_buffer.getvalue()
258
- base64_encoded_image = base64.b64encode(image_bytes).decode("utf-8")
259
- return base64_encoded_image
260
-
261
- ###############################################
262
- # Model
263
- ###############################################
264
-
265
- def get_model_name(self):
266
- return self.model_name
267
-
268
- def load_model(self, async_mode: bool = False):
269
- Client = AsyncOpenAI if async_mode else OpenAI
270
- return self._build_client(Client)
271
-
272
- def _client_kwargs(self) -> Dict:
273
- """
274
- If Tenacity is managing retries, force OpenAI SDK retries off to avoid
275
- double retries. If the user opts into SDK retries for 'openai' via
276
- DEEPEVAL_SDK_RETRY_PROVIDERS, leave their retry settings as is.
277
- """
278
- kwargs: Dict = {}
279
- if not sdk_retries_for(PS.OPENAI):
280
- kwargs["max_retries"] = 0
281
-
282
- if not kwargs.get("timeout"):
283
- kwargs["timeout"] = _request_timeout_seconds()
284
- return kwargs
285
-
286
- def _build_client(self, cls):
287
- api_key = require_secret_api_key(
288
- self._openai_api_key,
289
- provider_label="OpenAI",
290
- env_var_name="OPENAI_API_KEY",
291
- param_hint="`_openai_api_key` to MultimodalOpenAIModel(...)",
292
- )
293
-
294
- kw = dict(
295
- api_key=api_key,
296
- **self._client_kwargs(),
297
- )
298
- try:
299
- return cls(**kw)
300
- except TypeError as e:
301
- # older OpenAI SDKs may not accept max_retries, in that case remove and retry once
302
- if "max_retries" in str(e):
303
- kw.pop("max_retries", None)
304
- return cls(**kw)
305
- raise
306
-
307
- def _client(self, async_mode: bool = False):
308
- # Backwards-compat path for internal callers in this module
309
- return self.load_model(async_mode=async_mode)
@@ -1,13 +0,0 @@
1
- from deepeval.optimization.prompt_optimizer import PromptOptimizer
2
- from deepeval.optimization.configs import OptimizerDisplayConfig
3
- from deepeval.optimization.gepa.loop import (
4
- GEPARunner as GEPARunner,
5
- GEPAConfig as GEPAConfig,
6
- )
7
-
8
- __all__ = [
9
- "GEPARunner",
10
- "GEPAConfig",
11
- "PromptOptimizer",
12
- "OptimizerDisplayConfig",
13
- ]
@@ -1,2 +0,0 @@
1
- # nothing yet
2
- __all__ = []