deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_relevancy/turn_relevancy.py CHANGED
@@ -11,15 +11,19 @@ from deepeval.metrics.utils import (
     construct_verbose_logs,
     get_turns_in_sliding_window,
     get_unit_interactions,
-    trimAndLoadJson,
     initialize_model,
     convert_turn_to_dict,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import ConversationalTestCase, Turn, TurnParams
 from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.turn_relevancy.schema import *
+from deepeval.metrics.turn_relevancy.schema import (
+    TurnRelevancyVerdict,
+    TurnRelevancyScoreReason,
+)
 from deepeval.metrics.api import metric_data_manager


@@ -53,7 +57,12 @@ class TurnRelevancyMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -108,7 +117,12 @@ class TurnRelevancyMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -148,7 +162,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
         )
         return self.score

-    async def _a_generate_reason(self) -> str:
+    async def _a_generate_reason(self) -> Optional[str]:
         if self.include_reason is False:
             return None

@@ -162,24 +176,19 @@ class TurnRelevancyMetric(BaseConversationalMetric):
         prompt = TurnRelevancyTemplate.generate_reason(
             score=self.score, irrelevancies=irrelevancies
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=TurnRelevancyScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: TurnRelevancyScoreReason = await self.model.a_generate(
-                    prompt, schema=TurnRelevancyScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]

-    def _generate_reason(self) -> str:
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TurnRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
+
+    def _generate_reason(self) -> Optional[str]:
+        if self.include_reason is False:
+            return None
+
         irrelevancies: List[Dict[str, str]] = []
         for index, verdict in enumerate(self.verdicts):
             if verdict.verdict.strip().lower() == "no":
@@ -190,22 +199,14 @@ class TurnRelevancyMetric(BaseConversationalMetric):
         prompt = TurnRelevancyTemplate.generate_reason(
             score=self.score, irrelevancies=irrelevancies
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=TurnRelevancyScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: TurnRelevancyScoreReason = self.model.generate(
-                    prompt, schema=TurnRelevancyScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TurnRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_verdict(
         self, turns_sliding_window: List[Turn]
@@ -215,22 +216,14 @@ class TurnRelevancyMetric(BaseConversationalMetric):
                 convert_turn_to_dict(turn) for turn in turns_sliding_window
             ]
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=TurnRelevancyVerdict
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: TurnRelevancyVerdict = await self.model.a_generate(
-                    prompt, schema=TurnRelevancyVerdict
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return TurnRelevancyVerdict(**data)
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TurnRelevancyVerdict,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: TurnRelevancyVerdict(**data),
+        )

     def _generate_verdict(
         self, turns_sliding_window: List[Turn]
@@ -240,20 +233,14 @@ class TurnRelevancyMetric(BaseConversationalMetric):
                 convert_turn_to_dict(turn) for turn in turns_sliding_window
             ]
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=TurnRelevancyVerdict)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: TurnRelevancyVerdict = self.model.generate(
-                    prompt, schema=TurnRelevancyVerdict
-                )
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return TurnRelevancyVerdict(**data)
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TurnRelevancyVerdict,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: TurnRelevancyVerdict(**data),
+        )

     def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)
@@ -274,7 +261,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
         else:
             try:
                 self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
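The hunks above replace TurnRelevancyMetric's per-method native-model/try-except branching with the shared schema helpers. For orientation, a minimal usage sketch of the metric being refactored; the `Turn(role=..., content=...)` fields and metric keyword arguments are assumptions based on deepeval's public API and are not part of this diff:

from deepeval.metrics import TurnRelevancyMetric
from deepeval.test_case import ConversationalTestCase, Turn

# Hypothetical conversation; field names assumed from deepeval's public API.
test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="Can you reset my password?"),
        Turn(role="assistant", content="Sure, I just emailed you a reset link."),
    ]
)

metric = TurnRelevancyMetric(threshold=0.7, include_reason=True)
metric.measure(test_case)  # verdicts and reason now flow through the shared helpers
print(metric.score, metric.reason)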
 
deepeval/metrics/utils.py CHANGED
@@ -2,7 +2,17 @@ import inspect
 import json
 import re
 import sys
-from typing import Any, Dict, Optional, List, Union, Tuple
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+)

 from deepeval.errors import (
     MissingTestCaseParamsError,
@@ -22,10 +32,19 @@ from deepeval.models import (
     GeminiModel,
     AmazonBedrockModel,
     LiteLLMModel,
+    PortkeyModel,
     KimiModel,
     GrokModel,
     DeepSeekModel,
 )
+from deepeval.models.llms.constants import (
+    OPENAI_MODELS_DATA,
+    GEMINI_MODELS_DATA,
+    OLLAMA_MODELS_DATA,
+    ANTHROPIC_MODELS_DATA,
+    GROK_MODELS_DATA,
+    KIMI_MODELS_DATA,
+)
 from deepeval.key_handler import (
     ModelKeyValues,
     EmbeddingKeyValues,
@@ -34,7 +53,6 @@ from deepeval.key_handler import (
 from deepeval.metrics import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
     BaseArenaMetric,
 )
 from deepeval.models.base_model import DeepEvalBaseEmbeddingModel
@@ -49,19 +67,20 @@ from deepeval.test_case import (
     TurnParams,
 )

-MULTIMODAL_SUPPORTED_MODELS = [
-    GPTModel,
-    GeminiModel,
-    OllamaModel,
-    AzureOpenAIModel,
-]
+MULTIMODAL_SUPPORTED_MODELS = {
+    GPTModel: OPENAI_MODELS_DATA,
+    GeminiModel: GEMINI_MODELS_DATA,
+    OllamaModel: OLLAMA_MODELS_DATA,
+    AzureOpenAIModel: OPENAI_MODELS_DATA,
+    KimiModel: KIMI_MODELS_DATA,
+    AnthropicModel: ANTHROPIC_MODELS_DATA,
+    GrokModel: GROK_MODELS_DATA,
+}


 def copy_metrics(
-    metrics: List[
-        Union[BaseMetric, BaseConversationalMetric, BaseMultimodalMetric]
-    ],
-) -> List[Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric]]:
+    metrics: List[Union[BaseMetric, BaseConversationalMetric]],
+) -> List[Union[BaseMetric, BaseConversationalMetric]]:
     copied_metrics = []
     for metric in metrics:
         metric_class = type(metric)
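MULTIMODAL_SUPPORTED_MODELS is now a mapping from provider class to a per-model capability table rather than a flat list of classes. A rough sketch of how such a table can be queried, mirroring the loop the hunks below add; the exact value shape of OPENAI_MODELS_DATA (model name to DeepEvalModelData, possibly behind a callable) is inferred from that loop rather than stated anywhere in this diff:

from typing import List

from deepeval.models.llms.constants import OPENAI_MODELS_DATA


def multimodal_model_names(models_data) -> List[str]:
    names = []
    for model_name, model_data in models_data.items():
        if callable(model_data):
            # some entries appear to be lazily constructed
            model_data = model_data()
        if model_data.supports_multimodal:
            names.append(model_name)
    return names


print(multimodal_model_names(OPENAI_MODELS_DATA))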
@@ -204,13 +223,21 @@ def check_conversational_test_case_params(
 ):
     if multimodal:
         if not model or not model.supports_multimodal():
-            if model and type(model) in MULTIMODAL_SUPPORTED_MODELS:
+            if model and type(model) in MULTIMODAL_SUPPORTED_MODELS.keys():
+                valid_multimodal_models = []
+                for model_name, model_data in MULTIMODAL_SUPPORTED_MODELS.get(
+                    type(model)
+                ).items():
+                    if callable(model_data):
+                        model_data = model_data()
+                    if model_data.supports_multimodal:
+                        valid_multimodal_models.append(model_name)
                 raise ValueError(
-                    f"The evaluation model {model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {model.__class__.__name__} provider includes {', '.join(model.__class__.valid_multimodal_models)}."
+                    f"The evaluation model {model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {model.__class__.__name__} provider includes {', '.join(valid_multimodal_models)}."
                 )
             else:
                 raise ValueError(
-                    f"The evaluation model {model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS])}"
+                    f"The evaluation model {model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS.keys()])}"
                 )

     if isinstance(test_case, ConversationalTestCase) is False:
@@ -245,8 +272,49 @@
 def check_llm_test_case_params(
     test_case: LLMTestCase,
     test_case_params: List[LLMTestCaseParams],
+    input_image_count: Optional[int],
+    actual_output_image_count: Optional[int],
     metric: Union[BaseMetric, BaseArenaMetric],
+    model: Optional[DeepEvalBaseLLM] = None,
+    multimodal: Optional[bool] = False,
 ):
+    if multimodal:
+        if not model or not model.supports_multimodal():
+            if model and type(model) in MULTIMODAL_SUPPORTED_MODELS.keys():
+                valid_multimodal_models = []
+                for model_name, model_data in MULTIMODAL_SUPPORTED_MODELS.get(
+                    type(model)
+                ).items():
+                    if callable(model_data):
+                        model_data = model_data()
+                    if model_data.supports_multimodal:
+                        valid_multimodal_models.append(model_name)
+                raise ValueError(
+                    f"The evaluation model {model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {model.__class__.__name__} provider includes {', '.join(valid_multimodal_models)}."
+                )
+            else:
+                raise ValueError(
+                    f"The evaluation model {model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS.keys()])}"
+                )
+
+        if input_image_count:
+            count = 0
+            for ele in convert_to_multi_modal_array(test_case.input):
+                if isinstance(ele, MLLMImage):
+                    count += 1
+            if count != input_image_count:
+                error_str = f"Can only evaluate test cases with '{input_image_count}' input images using the '{metric.__name__}' metric. `{count}` found."
+                raise ValueError(error_str)
+
+        if actual_output_image_count:
+            count = 0
+            for ele in convert_to_multi_modal_array(test_case.actual_output):
+                if isinstance(ele, MLLMImage):
+                    count += 1
+            if count != actual_output_image_count:
+                error_str = f"Unable to evaluate test cases with '{actual_output_image_count}' output images using the '{metric.__name__}' metric. `{count}` found."
+                raise ValueError(error_str)
+
     if isinstance(test_case, LLMTestCase) is False:
         error_str = f"Unable to evaluate test cases that are not of type 'LLMTestCase' using the non-conversational '{metric.__name__}' metric."
         metric.error = error_str
@@ -276,6 +344,8 @@ def check_arena_test_case_params(
     arena_test_case: ArenaTestCase,
     test_case_params: List[LLMTestCaseParams],
     metric: BaseArenaMetric,
+    model: Optional[DeepEvalBaseLLM] = None,
+    multimodal: Optional[bool] = False,
 ):
     if not isinstance(arena_test_case, ArenaTestCase):
         raise ValueError(
@@ -296,79 +366,8 @@
     )

     for test_case in cases:
-        check_llm_test_case_params(test_case, test_case_params, metric)
-
-
-def check_mllm_test_case_params(
-    test_case: LLMTestCase,
-    test_case_params: List[LLMTestCaseParams],
-    input_image_count: Optional[int],
-    actual_output_image_count: Optional[int],
-    metric: BaseMetric,
-    model: Optional[DeepEvalBaseLLM] = None,
-):
-    if not model or not model.supports_multimodal():
-        if model and type(model) in MULTIMODAL_SUPPORTED_MODELS:
-            raise ValueError(
-                f"The evaluation model {model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {model.__class__.__name__} provider includes {', '.join(model.__class__.valid_multimodal_models)}."
-            )
-        else:
-            raise ValueError(
-                f"The evaluation model {model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS])}"
-            )
-
-    if input_image_count:
-        count = 0
-        for ele in convert_to_multi_modal_array(test_case.input):
-            if isinstance(ele, MLLMImage):
-                count += 1
-        if count != input_image_count:
-            error_str = f"Can only evaluate test cases with '{input_image_count}' input images using the '{metric.__name__}' metric. `{count}` found."
-            raise ValueError(error_str)
-
-    if actual_output_image_count:
-        count = 0
-        for ele in convert_to_multi_modal_array(test_case.actual_output):
-            if isinstance(ele, MLLMImage):
-                count += 1
-        if count != actual_output_image_count:
-            error_str = f"Unable to evaluate test cases with '{actual_output_image_count}' output images using the '{metric.__name__}' metric. `{count}` found."
-            raise ValueError(error_str)
-
-    missing_params = []
-    for param in test_case_params:
-        if getattr(test_case, param.value) is None:
-            missing_params.append(f"'{param.value}'")
-
-    if missing_params:
-        if len(missing_params) == 1:
-            missing_params_str = missing_params[0]
-        elif len(missing_params) == 2:
-            missing_params_str = " and ".join(missing_params)
-        else:
-            missing_params_str = (
-                ", ".join(missing_params[:-1]) + ", and " + missing_params[-1]
-            )

-        error_str = f"{missing_params_str} cannot be None for the '{metric.__name__}' metric"
-        metric.error = error_str
-        raise MissingTestCaseParamsError(error_str)
-
-
-def check_mllm_test_cases_params(
-    test_cases: List[LLMTestCase],
-    test_case_params: List[LLMTestCaseParams],
-    input_image_count: Optional[int],
-    actual_output_image_count: Optional[int],
-    metric: BaseMetric,
-):
-    for test_case in test_cases:
-        check_mllm_test_case_params(
-            test_case,
-            test_case_params,
-            input_image_count,
-            actual_output_image_count,
-            metric,
+        check_llm_test_case_params(
+            test_case, test_case_params, None, None, metric, model, multimodal
         )

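With check_mllm_test_case_params and check_mllm_test_cases_params removed, image-count and multimodal-capability validation now runs through check_llm_test_case_params. A hedged sketch of what a call site looks like after the change; validate_image_test_case and its arguments are hypothetical, and only the helper's new signature comes from the hunks above:

from deepeval.metrics.utils import check_llm_test_case_params
from deepeval.test_case import LLMTestCase, LLMTestCaseParams


def validate_image_test_case(test_case: LLMTestCase, metric, model) -> None:
    # 3.7.5 callers used check_mllm_test_case_params(...); in 3.7.7 the same
    # checks are folded into check_llm_test_case_params with multimodal=True.
    check_llm_test_case_params(
        test_case,
        [LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
        None,  # input_image_count: no constraint on image count
        None,  # actual_output_image_count: no constraint on image count
        metric,
        model=model,
        multimodal=True,
    )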
 
@@ -398,11 +397,73 @@ def trimAndLoadJson(
         raise Exception(f"An unexpected error occurred: {str(e)}")


+SchemaType = TypeVar("SchemaType")
+ReturnType = TypeVar("ReturnType")
+
+
+def generate_with_schema_and_extract(
+    metric: Union[BaseMetric, BaseArenaMetric, BaseConversationalMetric],
+    prompt: Any,
+    schema_cls: Type[SchemaType],
+    *,
+    extract_schema: Callable[[SchemaType], ReturnType],
+    extract_json: Callable[[Dict[str, Any]], ReturnType],
+) -> ReturnType:
+    """
+    Synchronous wrapper:
+    - calls model.generate_with_schema(...)
+    - accrues cost if applicable
+    - if schema instance -> extract_schema
+      else parse JSON -> extract_json
+    """
+    if metric.using_native_model:
+        result, cost = metric.model.generate_with_schema(
+            prompt, schema=schema_cls
+        )
+        metric._accrue_cost(cost)
+    else:
+        result = metric.model.generate_with_schema(prompt, schema=schema_cls)
+    if isinstance(result, schema_cls):
+        return extract_schema(result)
+    data = trimAndLoadJson(result, metric)
+    return extract_json(data)
+
+
+async def a_generate_with_schema_and_extract(
+    metric: Union[BaseMetric, BaseArenaMetric, BaseConversationalMetric],
+    prompt: Any,
+    schema_cls: Type[SchemaType],
+    *,
+    extract_schema: Callable[[SchemaType], ReturnType],
+    extract_json: Callable[[Dict[str, Any]], ReturnType],
+) -> ReturnType:
+    if metric.using_native_model:
+        result, cost = await metric.model.a_generate_with_schema(
+            prompt, schema=schema_cls
+        )
+        metric._accrue_cost(cost)
+    else:
+        result = await metric.model.a_generate_with_schema(
+            prompt, schema=schema_cls
+        )
+
+    if isinstance(result, schema_cls):
+        return extract_schema(result)
+
+    data = trimAndLoadJson(result, metric)
+    return extract_json(data)
+
+
 ###############################################
 # Default Model Providers
 ###############################################


+def should_use_anthropic_model():
+    value = KEY_FILE_HANDLER.fetch_data(ModelKeyValues.USE_ANTHROPIC_MODEL)
+    return value.lower() == "yes" if value is not None else False
+
+
 def should_use_azure_openai():
     value = KEY_FILE_HANDLER.fetch_data(ModelKeyValues.USE_AZURE_OPENAI)
     return value.lower() == "yes" if value is not None else False
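A toy, self-contained illustration (not taken from the diff) of the two paths generate_with_schema_and_extract takes: a structured result is handed to extract_schema, while anything else is parsed with trimAndLoadJson and handed to extract_json. ToyModel and ToyMetric are made-up stand-ins that expose only the attributes the helper touches; real metrics pass `self`:

from pydantic import BaseModel

from deepeval.metrics.utils import generate_with_schema_and_extract


class Verdict(BaseModel):
    verdict: str
    reason: str


class ToyModel:
    def generate_with_schema(self, prompt, schema=None):
        # Pretend this provider ignores `schema` and returns raw JSON text,
        # which forces the helper onto its trimAndLoadJson fallback path.
        return '{"verdict": "yes", "reason": "on topic"}'


class ToyMetric:
    using_native_model = False
    model = ToyModel()
    error = None  # trimAndLoadJson writes here on malformed JSON


verdict = generate_with_schema_and_extract(
    metric=ToyMetric(),
    prompt="Is the reply relevant?",
    schema_cls=Verdict,
    extract_schema=lambda v: v,                 # structured-output path
    extract_json=lambda data: Verdict(**data),  # JSON fallback path
)
print(verdict.verdict)  # "yes"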
@@ -414,8 +475,8 @@ def should_use_local_model():


 def should_use_ollama_model():
-    base_url = KEY_FILE_HANDLER.fetch_data(ModelKeyValues.LOCAL_MODEL_API_KEY)
-    return base_url == "ollama"
+    value = KEY_FILE_HANDLER.fetch_data(ModelKeyValues.LOCAL_MODEL_API_KEY)
+    return value == "ollama"


 def should_use_gemini_model():
@@ -433,6 +494,11 @@ def should_use_litellm():
     return value.lower() == "yes" if value is not None else False


+def should_use_portkey():
+    value = KEY_FILE_HANDLER.fetch_data(ModelKeyValues.USE_PORTKEY_MODEL)
+    return value.lower() == "yes" if value is not None else False
+
+
 def should_use_deepseek_model():
     value = KEY_FILE_HANDLER.fetch_data(ModelKeyValues.USE_DEEPSEEK_MODEL)
     return value.lower() == "yes" if value is not None else False
@@ -471,6 +537,8 @@ def initialize_model(
         return GeminiModel(), True
     if should_use_litellm():
         return LiteLLMModel(), True
+    if should_use_portkey():
+        return PortkeyModel(), True
     if should_use_ollama_model():
         return OllamaModel(), True
     elif should_use_local_model():
@@ -480,9 +548,11 @@
     elif should_use_moonshot_model():
         return KimiModel(model=model), True
     elif should_use_grok_model():
-        return GrokModel(model=model), True
+        return GrokModel(), True
     elif should_use_deepseek_model():
         return DeepSeekModel(model=model), True
+    elif should_use_anthropic_model():
+        return AnthropicModel(), True
     elif isinstance(model, str) or model is None:
         return GPTModel(model=model), True

deepeval/models/__init__.py CHANGED
@@ -15,6 +15,7 @@ from deepeval.models.llms import (
     KimiModel,
     GrokModel,
     DeepSeekModel,
+    PortkeyModel,
 )
 from deepeval.models.embedding_models import (
     OpenAIEmbeddingModel,
@@ -42,4 +43,5 @@ __all__ = [
     "AzureOpenAIEmbeddingModel",
     "LocalEmbeddingModel",
     "OllamaEmbeddingModel",
+    "PortkeyModel",
 ]
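With PortkeyModel exported here and the new should_use_portkey()/should_use_anthropic_model() switches added to deepeval/metrics/utils.py above, the default evaluation model can now resolve to Portkey or Anthropic. A hedged sketch of the selection as far as it is visible in this diff; branches earlier than Gemini (for example Azure OpenAI) are not shown in these hunks, and initialize_model's exact signature is an assumption:

from deepeval.metrics.utils import initialize_model

# Branch order visible in the initialize_model() hunks above, first match wins:
#   Gemini -> LiteLLM -> Portkey (new) -> Ollama -> local model
#   -> Kimi -> Grok -> DeepSeek -> Anthropic (new) -> default GPTModel
model, using_native_model = initialize_model()
print(type(model).__name__, using_native_model)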
deepeval/models/base_model.py CHANGED
@@ -1,6 +1,18 @@
 from abc import ABC, abstractmethod
-from typing import Any, Optional, List
+from typing import Any, Optional, List, Union
 from deepeval.models.utils import parse_model_name
+from dataclasses import dataclass
+
+
+@dataclass
+class DeepEvalModelData:
+    supports_log_probs: Optional[bool] = None
+    supports_multimodal: Optional[bool] = None
+    supports_structured_outputs: Optional[bool] = None
+    supports_json: Optional[bool] = None
+    input_price: Optional[float] = None
+    output_price: Optional[float] = None
+    supports_temperature: Optional[bool] = True


 class DeepEvalBaseModel(ABC):
@@ -66,9 +78,6 @@ class DeepEvalBaseLLM(ABC):
     def get_model_name(self, *args, **kwargs) -> str:
         return self.name

-    def supports_multimodal(self) -> bool:
-        return False
-
     def batch_generate(self, *args, **kwargs) -> List[str]:
         """Runs the model to output LLM responses.

@@ -79,8 +88,37 @@ class DeepEvalBaseLLM(ABC):
             "batch_generate is not implemented for this model"
         )

-    def supports_multimodal(self):
-        return False
+    # Capabilities
+    def supports_log_probs(self) -> Union[bool, None]:
+        return None
+
+    def supports_temperature(self) -> Union[bool, None]:
+        return None
+
+    def supports_multimodal(self) -> Union[bool, None]:
+        return None
+
+    def supports_structured_outputs(self) -> Union[bool, None]:
+        return None
+
+    def supports_json_mode(self) -> Union[bool, None]:
+        return None
+
+    def generate_with_schema(self, *args, schema=None, **kwargs):
+        if schema is not None:
+            try:
+                return self.generate(*args, schema=schema, **kwargs)
+            except TypeError:
+                pass  # this means provider doesn't accept schema kwarg
+        return self.generate(*args, **kwargs)
+
+    async def a_generate_with_schema(self, *args, schema=None, **kwargs):
+        if schema is not None:
+            try:
+                return await self.a_generate(*args, schema=schema, **kwargs)
+            except TypeError:
+                pass
+        return await self.a_generate(*args, **kwargs)


 class DeepEvalBaseEmbeddingModel(ABC):
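To show what the new base-class surface means for custom models, a hedged sketch of a subclass whose generate() takes no schema kwarg: the inherited generate_with_schema() tries the kwarg, catches the TypeError, and retries without it, while the capability hooks default to None (unknown) unless overridden. The assumed abstract-method set (load_model, generate, a_generate, get_model_name) and the constructor handling are guesses about the existing DeepEvalBaseLLM contract, not part of this diff:

from deepeval.models.base_model import DeepEvalBaseLLM


class PlainTextModel(DeepEvalBaseLLM):
    """Custom model whose generate() does not accept a `schema` kwarg."""

    def __init__(self):
        # Base-class constructor details (model loading, name parsing) are
        # deliberately skipped in this sketch.
        self.name = "plain-text-model"

    def load_model(self):
        return None

    def generate(self, prompt: str) -> str:
        # No native structured outputs: returns plain JSON-looking text.
        return '{"reason": "stub"}'

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    def get_model_name(self) -> str:
        return self.name

    # Capability hook added in this release; None would mean "unknown".
    def supports_multimodal(self):
        return False


model = PlainTextModel()
# Inherited fallback: generate(prompt, schema=...) raises TypeError here,
# so generate_with_schema retries without the kwarg and returns raw text.
print(model.generate_with_schema("hello", schema=dict))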