deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/execute.py CHANGED
@@ -56,7 +56,6 @@ from deepeval.telemetry import capture_evaluation_run
 from deepeval.metrics import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
     TaskCompletionMetric,
 )
 from deepeval.metrics.indicator import (
@@ -70,7 +69,6 @@ from deepeval.models.retry_policy import (
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
 )
 from deepeval.test_case.api import create_api_test_case
 from deepeval.test_run import (
@@ -263,13 +261,10 @@ async def _await_with_outer_deadline(obj, *args, timeout: float, **kwargs):
 
 
 def execute_test_cases(
-    test_cases: Union[
-        List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
-    ],
+    test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
     metrics: Union[
         List[BaseMetric],
         List[BaseConversationalMetric],
-        List[BaseMultimodalMetric],
     ],
     error_config: Optional[ErrorConfig] = ErrorConfig(),
     display_config: Optional[DisplayConfig] = DisplayConfig(),
@@ -302,15 +297,12 @@ def execute_test_cases(
 
     conversational_metrics: List[BaseConversationalMetric] = []
     llm_metrics: List[BaseMetric] = []
-    mllm_metrics: List[BaseMultimodalMetric] = []
     for metric in metrics:
         metric.async_mode = False
         if isinstance(metric, BaseMetric):
             llm_metrics.append(metric)
         elif isinstance(metric, BaseConversationalMetric):
             conversational_metrics.append(metric)
-        elif isinstance(metric, BaseMultimodalMetric):
-            mllm_metrics.append(metric)
 
     test_results: List[TestResult] = []
 
@@ -318,7 +310,6 @@
         progress: Optional[Progress] = None, pbar_id: Optional[int] = None
     ):
         llm_test_case_count = -1
-        mllm_test_case_count = -1
         conversational_test_case_count = -1
         show_metric_indicator = (
             display_config.show_indicator and not _use_bar_indicator
@@ -330,11 +321,6 @@
                     update_pbar(progress, pbar_id)
                     continue
                 per_case_total = len(llm_metrics)
-            elif isinstance(test_case, MLLMTestCase):
-                if not mllm_metrics:
-                    update_pbar(progress, pbar_id)
-                    continue
-                per_case_total = len(mllm_metrics)
             elif isinstance(test_case, ConversationalTestCase):
                 if not conversational_metrics:
                     update_pbar(progress, pbar_id)
@@ -349,23 +335,15 @@
 
             metrics_for_case = (
                 llm_metrics
-                if isinstance(test_case, LLMTestCase)
-                else (
-                    mllm_metrics
-                    if isinstance(test_case, MLLMTestCase)
-                    else conversational_metrics
-                )
+                if (isinstance(test_case, LLMTestCase))
+                else conversational_metrics
             )
             api_test_case = create_api_test_case(
                 test_case=test_case,
                 index=(
                     llm_test_case_count + 1
-                    if isinstance(test_case, LLMTestCase)
-                    else (
-                        mllm_test_case_count + 1
-                        if isinstance(test_case, MLLMTestCase)
-                        else conversational_test_case_count + 1
-                    )
+                    if (isinstance(test_case, LLMTestCase))
+                    else (conversational_test_case_count + 1)
                 ),
             )
             emitted = [False] * len(metrics_for_case)
@@ -378,7 +356,7 @@
             try:
 
                 def _run_case():
-                    nonlocal new_cached_test_case, current_index, llm_test_case_count, mllm_test_case_count, conversational_test_case_count
+                    nonlocal new_cached_test_case, current_index, llm_test_case_count, conversational_test_case_count
                     with capture_evaluation_run("test case"):
                         for metric in metrics:
                             metric.error = None  # Reset metric error
@@ -435,26 +413,6 @@
                                 )
                                 update_pbar(progress, pbar_test_case_id)
 
-                        # No caching and not sending test cases to Confident AI for multimodal metrics yet
-                        elif isinstance(test_case, MLLMTestCase):
-                            mllm_test_case_count += 1
-                            for metric in mllm_metrics:
-                                current_index = index_of[id(metric)]
-                                res = _execute_metric(
-                                    metric=metric,
-                                    test_case=test_case,
-                                    show_metric_indicator=show_metric_indicator,
-                                    in_component=False,
-                                    error_config=error_config,
-                                )
-                                if res == "skip":
-                                    continue
-
-                                metric_data = create_metric_data(metric)
-                                api_test_case.update_metric_data(metric_data)
-                                emitted[current_index] = True
-                                update_pbar(progress, pbar_test_case_id)
-
                         # No caching for conversational metrics yet
                         elif isinstance(test_case, ConversationalTestCase):
                             conversational_test_case_count += 1
@@ -560,13 +518,10 @@
 
 
 async def a_execute_test_cases(
-    test_cases: Union[
-        List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
-    ],
+    test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
    metrics: Union[
        List[BaseMetric],
        List[BaseConversationalMetric],
-        List[BaseMultimodalMetric],
    ],
    error_config: Optional[ErrorConfig] = ErrorConfig(),
    display_config: Optional[DisplayConfig] = DisplayConfig(),
@@ -600,20 +555,16 @@ async def a_execute_test_cases(
         metric.verbose_mode = display_config.verbose_mode
 
     llm_metrics: List[BaseMetric] = []
-    mllm_metrics: List[BaseMultimodalMetric] = []
     conversational_metrics: List[BaseConversationalMetric] = []
     for metric in metrics:
         if isinstance(metric, BaseMetric):
             llm_metrics.append(metric)
-        elif isinstance(metric, BaseMultimodalMetric):
-            mllm_metrics.append(metric)
         elif isinstance(metric, BaseConversationalMetric):
             conversational_metrics.append(metric)
 
     llm_test_case_counter = -1
-    mllm_test_case_counter = -1
     conversational_test_case_counter = -1
-    test_results: List[Union[TestResult, MLLMTestCase]] = []
+    test_results: List[Union[TestResult, LLMTestCase]] = []
     tasks = []
 
     if display_config.show_indicator and _use_bar_indicator:
@@ -660,28 +611,6 @@
                 )
                 tasks.append(asyncio.create_task(task))
 
-            elif isinstance(test_case, MLLMTestCase):
-                mllm_test_case_counter += 1
-                copied_multimodal_metrics: List[
-                    BaseMultimodalMetric
-                ] = copy_metrics(mllm_metrics)
-                task = execute_with_semaphore(
-                    func=_a_execute_mllm_test_cases,
-                    metrics=copied_multimodal_metrics,
-                    test_case=test_case,
-                    test_run_manager=test_run_manager,
-                    test_results=test_results,
-                    count=mllm_test_case_counter,
-                    ignore_errors=error_config.ignore_errors,
-                    skip_on_missing_params=error_config.skip_on_missing_params,
-                    show_indicator=display_config.show_indicator,
-                    _use_bar_indicator=_use_bar_indicator,
-                    _is_assert_test=_is_assert_test,
-                    progress=progress,
-                    pbar_id=pbar_id,
-                )
-                tasks.append(asyncio.create_task(task))
-
             elif isinstance(test_case, ConversationalTestCase):
                 conversational_test_case_counter += 1
 
@@ -772,26 +701,6 @@
                 )
                 tasks.append(asyncio.create_task((task)))
 
-            elif isinstance(test_case, MLLMTestCase):
-                mllm_test_case_counter += 1
-                copied_multimodal_metrics: List[BaseMultimodalMetric] = (
-                    copy_metrics(mllm_metrics)
-                )
-                task = execute_with_semaphore(
-                    func=_a_execute_mllm_test_cases,
-                    metrics=copied_multimodal_metrics,
-                    test_case=test_case,
-                    test_run_manager=test_run_manager,
-                    test_results=test_results,
-                    count=mllm_test_case_counter,
-                    ignore_errors=error_config.ignore_errors,
-                    skip_on_missing_params=error_config.skip_on_missing_params,
-                    _use_bar_indicator=_use_bar_indicator,
-                    _is_assert_test=_is_assert_test,
-                    show_indicator=display_config.show_indicator,
-                )
-                tasks.append(asyncio.create_task(task))
-
             await asyncio.sleep(async_config.throttle_value)
 
     try:
@@ -815,7 +724,7 @@ async def _a_execute_llm_test_cases(
     metrics: List[BaseMetric],
     test_case: LLMTestCase,
     test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, MLLMTestCase]],
+    test_results: List[Union[TestResult, LLMTestCase]],
     count: int,
     test_run: TestRun,
     ignore_errors: bool,
@@ -932,88 +841,11 @@ async def _a_execute_llm_test_cases(
     update_pbar(progress, pbar_id)
 
 
-async def _a_execute_mllm_test_cases(
-    metrics: List[BaseMultimodalMetric],
-    test_case: MLLMTestCase,
-    test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, MLLMTestCase]],
-    count: int,
-    ignore_errors: bool,
-    skip_on_missing_params: bool,
-    show_indicator: bool,
-    _use_bar_indicator: bool,
-    _is_assert_test: bool,
-    progress: Optional[Progress] = None,
-    pbar_id: Optional[int] = None,
-):
-    show_metrics_indicator = show_indicator and not _use_bar_indicator
-    pbar_test_case_id = add_pbar(
-        progress,
-        f" 🎯 Evaluating test case #{count}",
-        total=len(metrics),
-    )
-
-    for metric in metrics:
-        metric.skipped = False
-        metric.error = None  # Reset metric error
-
-    api_test_case: LLMApiTestCase = create_api_test_case(
-        test_case=test_case, index=count if not _is_assert_test else None
-    )
-    test_start_time = time.perf_counter()
-    try:
-        await measure_metrics_with_indicator(
-            metrics=metrics,
-            test_case=test_case,
-            cached_test_case=None,
-            skip_on_missing_params=skip_on_missing_params,
-            ignore_errors=ignore_errors,
-            show_indicator=show_metrics_indicator,
-            pbar_eval_id=pbar_test_case_id,
-            progress=progress,
-        )
-    except asyncio.CancelledError:
-        msg = (
-            "Timed out/cancelled while evaluating metric. "
-            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
-            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
-        )
-        for m in metrics:
-            if getattr(m, "skipped", False):
-                continue
-            # If the task never finished and didn't set a terminal state, mark it now
-            if getattr(m, "success", None) is None and not getattr(
-                m, "error", None
-            ):
-                m.success = False
-                m.error = msg
-        if not ignore_errors:
-            raise
-    finally:
-        for metric in metrics:
-            if metric.skipped:
-                continue
-
-            metric_data = create_metric_data(metric)
-            api_test_case.update_metric_data(metric_data)
-
-        test_end_time = time.perf_counter()
-        run_duration = test_end_time - test_start_time
-        api_test_case.update_run_duration(run_duration)
-
-        ### Update Test Run ###
-        test_run_manager.update_test_run(api_test_case, test_case)
-        test_results.append(create_test_result(api_test_case))
-        update_pbar(progress, pbar_id)
-
-
 async def _a_execute_conversational_test_cases(
-    metrics: List[
-        Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric]
-    ],
+    metrics: List[Union[BaseMetric, BaseConversationalMetric]],
     test_case: ConversationalTestCase,
     test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, MLLMTestCase]],
+    test_results: List[Union[TestResult, LLMTestCase]],
     count: int,
     ignore_errors: bool,
     skip_on_missing_params: bool,
@@ -1776,7 +1608,7 @@ async def a_execute_agentic_test_cases(
 async def _a_execute_agentic_test_case(
     golden: Golden,
     test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, MLLMTestCase]],
+    test_results: List[Union[TestResult, LLMTestCase]],
     count: int,
     verbose_mode: Optional[bool],
     ignore_errors: bool,
@@ -3205,7 +3037,7 @@ async def _evaluate_test_case_pairs(
 
 def _execute_metric(
     metric: BaseMetric,
-    test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
+    test_case: Union[LLMTestCase, ConversationalTestCase],
     show_metric_indicator: bool,
     in_component: bool,
     error_config: ErrorConfig,
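With the MLLMTestCase branches removed, execute_test_cases and a_execute_test_cases dispatch only LLMTestCase and ConversationalTestCase. A minimal sketch of a 3.7.6 call site (the metric choice here is illustrative, not taken from this diff):

from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# Only the two remaining test case types are routed by execute_test_cases;
# MLLMTestCase no longer exists in deepeval.test_case as of 3.7.6.
test_case = LLMTestCase(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
)
evaluate(test_cases=[test_case], metrics=[AnswerRelevancyMetric()])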
deepeval/evaluate/utils.py CHANGED
@@ -11,12 +11,10 @@ from deepeval.metrics import (
     ArenaGEval,
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
 )
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
 )
 from deepeval.test_run import (
     LLMApiTestCase,
@@ -129,17 +127,14 @@ def create_test_result(
             turns=api_test_case.turns,
         )
     else:
-        multimodal = (
-            api_test_case.multimodal_input is not None
-            and api_test_case.multimodal_input_actual_output is not None
-        )
+        multimodal = api_test_case.images_mapping
         if multimodal:
             return TestResult(
                 name=name,
                 success=api_test_case.success,
                 metrics_data=api_test_case.metrics_data,
-                input=api_test_case.multimodal_input,
-                actual_output=api_test_case.multimodal_input_actual_output,
+                input=api_test_case.input,
+                actual_output=api_test_case.actual_output,
                 conversational=False,
                 multimodal=True,
                 additional_metadata=api_test_case.additional_metadata,
@@ -222,7 +217,7 @@ def validate_assert_test_inputs(
         )
 
     if test_case and metrics:
-        if isinstance(test_case, LLMTestCase) and not all(
+        if (isinstance(test_case, LLMTestCase)) and not all(
             isinstance(metric, BaseMetric) for metric in metrics
         ):
             raise ValueError(
@@ -234,12 +229,6 @@
             raise ValueError(
                 "All 'metrics' for an 'ConversationalTestCase' must be instances of 'BaseConversationalMetric' only."
             )
-        if isinstance(test_case, MLLMTestCase) and not all(
-            isinstance(metric, BaseMultimodalMetric) for metric in metrics
-        ):
-            raise ValueError(
-                "All 'metrics' for an 'MLLMTestCase' must be instances of 'BaseMultimodalMetric' only."
-            )
 
     if not ((golden and observed_callback) or (test_case and metrics)):
         raise ValueError(
@@ -251,15 +240,12 @@ def validate_evaluate_inputs(
     goldens: Optional[List] = None,
     observed_callback: Optional[Callable] = None,
     test_cases: Optional[
-        Union[
-            List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
-        ]
+        Union[List[LLMTestCase], List[ConversationalTestCase]]
     ] = None,
     metrics: Optional[
         Union[
             List[BaseMetric],
             List[BaseConversationalMetric],
-            List[BaseMultimodalMetric],
         ]
     ] = None,
     metric_collection: Optional[str] = None,
@@ -292,7 +278,7 @@
     if test_cases and metrics:
         for test_case in test_cases:
             for metric in metrics:
-                if isinstance(test_case, LLMTestCase) and not isinstance(
+                if (isinstance(test_case, LLMTestCase)) and not isinstance(
                     metric, BaseMetric
                 ):
                     raise ValueError(
@@ -305,12 +291,6 @@
                     raise ValueError(
                         f"Metric {metric.__name__} is not a valid metric for ConversationalTestCase."
                     )
-                if isinstance(test_case, MLLMTestCase) and not isinstance(
-                    metric, BaseMultimodalMetric
-                ):
-                    raise ValueError(
-                        f"Metric {metric.__name__} is not a valid metric for MLLMTestCase."
-                    )
 
 
 def print_test_result(test_result: TestResult, display: TestRunResultDisplay):
deepeval/integrations/pydantic_ai/agent.py CHANGED
@@ -1,12 +1,29 @@
 import warnings
+from typing import TYPE_CHECKING, Any
 
 try:
-    from pydantic_ai.agent import Agent
+    from pydantic_ai.agent import Agent as _BaseAgent
 
     is_pydantic_ai_installed = True
-except:
+except ImportError:
     is_pydantic_ai_installed = False
 
+    class _BaseAgent:
+        """Dummy fallback so imports don't crash when pydantic-ai is missing."""
+
+        def __init__(self, *args: Any, **kwargs: Any) -> None:
+            # No-op: for compatibility
+            pass
+
+
+if TYPE_CHECKING:
+    # For type checkers: use the real Agent if available.
+    from pydantic_ai.agent import Agent  # type: ignore[unused-ignore]
+else:
+    # At runtime we always have some base: real Agent or our dummy.
+    # This is just to avoid blow-ups.
+    Agent = _BaseAgent
+
 
 class DeepEvalPydanticAIAgent(Agent):
 
deepeval/integrations/pydantic_ai/instrumentator.py CHANGED
@@ -1,40 +1,58 @@
+from __future__ import annotations
+
 import json
 import logging
 import os
 from time import perf_counter
-from typing import Literal, Optional, List
+from typing import Any, List, Optional, TYPE_CHECKING
 
 from deepeval.config.settings import get_settings
 from deepeval.confident.api import get_confident_api_key
 from deepeval.metrics.base_metric import BaseMetric
 from deepeval.prompt import Prompt
 from deepeval.tracing.context import current_trace_context
-from deepeval.tracing.types import Trace
-from deepeval.tracing.otel.utils import to_hex_string
-from deepeval.tracing.tracing import trace_manager
-from deepeval.tracing.otel.utils import normalize_pydantic_ai_messages
 from deepeval.tracing.otel.exporter import ConfidentSpanExporter
-
+from deepeval.tracing.otel.test_exporter import test_exporter
+from deepeval.tracing.otel.utils import (
+    normalize_pydantic_ai_messages,
+    to_hex_string,
+)
+from deepeval.tracing.perf_epoch_bridge import init_clock_bridge
+from deepeval.tracing.tracing import trace_manager
+from deepeval.tracing.types import (
+    AgentSpan,
+    Trace,
+    TraceSpanStatus,
+    ToolCall,
+)
 
 logger = logging.getLogger(__name__)
 
-
 try:
-    from pydantic_ai.models.instrumented import InstrumentationSettings
-    from opentelemetry.sdk.trace import SpanProcessor, TracerProvider
+    # Optional dependencies
+    from opentelemetry.sdk.trace import (
+        ReadableSpan as _ReadableSpan,
+        SpanProcessor as _SpanProcessor,
+        TracerProvider,
+    )
     from opentelemetry.sdk.trace.export import BatchSpanProcessor
     from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
         OTLPSpanExporter,
     )
-    from opentelemetry.sdk.trace import ReadableSpan
+    from pydantic_ai.models.instrumented import (
+        InstrumentationSettings as _BaseInstrumentationSettings,
+    )
 
     dependency_installed = True
 except ImportError as e:
+    dependency_installed = False
+
+    # Preserve previous behavior: only log when verbose mode is enabled.
     if get_settings().DEEPEVAL_VERBOSE_MODE:
         if isinstance(e, ModuleNotFoundError):
             logger.warning(
                 "Optional tracing dependency not installed: %s",
-                e.name,
+                getattr(e, "name", repr(e)),
                 stacklevel=2,
             )
         else:
@@ -43,26 +61,47 @@ except ImportError as e:
                 e,
                 stacklevel=2,
             )
-    dependency_installed = False
+
+    # Dummy fallbacks so imports and class definitions don't crash when
+    # optional deps are missing. Actual use is still guarded by
+    # is_dependency_installed().
+    class _BaseInstrumentationSettings:
+        def __init__(self, *args: Any, **kwargs: Any) -> None:
+            pass
+
+    class _SpanProcessor:
+        def __init__(self, *args: Any, **kwargs: Any) -> None:
+            pass
+
+        def on_start(self, span: Any, parent_context: Any) -> None:
+            pass
+
+        def on_end(self, span: Any) -> None:
+            pass
+
+    class _ReadableSpan:
+        pass
 
 
-def is_dependency_installed():
+def is_dependency_installed() -> bool:
     if not dependency_installed:
         raise ImportError(
-            "Dependencies are not installed. Please install it with `pip install pydantic-ai opentelemetry-sdk opentelemetry-exporter-otlp-proto-http`."
+            "Dependencies are not installed. Please install it with "
+            "`pip install pydantic-ai opentelemetry-sdk "
+            "opentelemetry-exporter-otlp-proto-http`."
        )
    return True
 
 
-from deepeval.tracing.types import AgentSpan
-from deepeval.confident.api import get_confident_api_key
-from deepeval.prompt import Prompt
-from deepeval.tracing.otel.test_exporter import test_exporter
-from deepeval.tracing.context import current_trace_context
-from deepeval.tracing.types import Trace
-from deepeval.tracing.otel.utils import to_hex_string
-from deepeval.tracing.types import TraceSpanStatus, ToolCall
-from deepeval.tracing.perf_epoch_bridge import init_clock_bridge
+if TYPE_CHECKING:
+    # For type checkers, use real types
+    from opentelemetry.sdk.trace import ReadableSpan, SpanProcessor
+    from pydantic_ai.models.instrumented import InstrumentationSettings
+else:
+    # At runtime we always have something to subclass / annotate with
+    InstrumentationSettings = _BaseInstrumentationSettings
+    SpanProcessor = _SpanProcessor
+    ReadableSpan = _ReadableSpan
 
 # OTLP_ENDPOINT = "http://127.0.0.1:4318/v1/traces"
 OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
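Both pydantic_ai files above apply the same optional-dependency guard: import the real base class under a private alias inside try/except, define a no-op dummy in the except branch, and bind the public name at runtime while TYPE_CHECKING preserves the real type for static analysis. A generic sketch of the pattern ("somepkg" is a hypothetical package name, not part of this diff):

from typing import TYPE_CHECKING, Any

try:
    from somepkg import Base as _Base
except ImportError:

    class _Base:
        # No-op stand-in so the class definitions below import cleanly.
        def __init__(self, *args: Any, **kwargs: Any) -> None:
            pass


if TYPE_CHECKING:
    from somepkg import Base  # real type for type checkers
else:
    Base = _Base  # real class when installed, dummy otherwise


class Wrapper(Base):
    """Importable either way; only usable when somepkg is installed."""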
deepeval/key_handler.py CHANGED
@@ -99,7 +99,10 @@ class ModelKeyValues(Enum):
 class EmbeddingKeyValues(Enum):
     # Azure OpenAI
     USE_AZURE_OPENAI_EMBEDDING = "USE_AZURE_OPENAI_EMBEDDING"
+    # Azure OpenAI
+    AZURE_EMBEDDING_MODEL_NAME = "AZURE_EMBEDDING_MODEL_NAME"
     AZURE_EMBEDDING_DEPLOYMENT_NAME = "AZURE_EMBEDDING_DEPLOYMENT_NAME"
+
     # Local
     USE_LOCAL_EMBEDDINGS = "USE_LOCAL_EMBEDDINGS"
     LOCAL_EMBEDDING_MODEL_NAME = "LOCAL_EMBEDDING_MODEL_NAME"
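The new AZURE_EMBEDDING_MODEL_NAME key sits alongside the existing deployment name. Assuming it is consumed like the neighboring EmbeddingKeyValues entries, whose enum values double as environment variable names (an assumption based on the surrounding keys, not on code shown in this diff), configuration would look roughly like:

import os

from deepeval.key_handler import EmbeddingKeyValues

# Assumption: each enum value doubles as its environment variable name;
# the model and deployment names below are placeholders.
os.environ[EmbeddingKeyValues.AZURE_EMBEDDING_MODEL_NAME.value] = (
    "text-embedding-3-large"
)
os.environ[EmbeddingKeyValues.AZURE_EMBEDDING_DEPLOYMENT_NAME.value] = (
    "my-embedding-deployment"
)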
deepeval/metrics/__init__.py CHANGED
@@ -1,7 +1,6 @@
 from .base_metric import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
     BaseArenaMetric,
 )
 
@@ -42,6 +41,16 @@ from .mcp_use_metric.mcp_use_metric import MCPUseMetric
 from .turn_relevancy.turn_relevancy import (
     TurnRelevancyMetric,
 )
+from .turn_faithfulness.turn_faithfulness import TurnFaithfulnessMetric
+from .turn_contextual_precision.turn_contextual_precision import (
+    TurnContextualPrecisionMetric,
+)
+from .turn_contextual_recall.turn_contextual_recall import (
+    TurnContextualRecallMetric,
+)
+from .turn_contextual_relevancy.turn_contextual_relevancy import (
+    TurnContextualRelevancyMetric,
+)
 from .conversation_completeness.conversation_completeness import (
     ConversationCompletenessMetric,
 )
@@ -55,13 +64,6 @@ from .multimodal_metrics import (
     ImageCoherenceMetric,
     ImageHelpfulnessMetric,
     ImageReferenceMetric,
-    MultimodalContextualRecallMetric,
-    MultimodalContextualRelevancyMetric,
-    MultimodalContextualPrecisionMetric,
-    MultimodalAnswerRelevancyMetric,
-    MultimodalFaithfulnessMetric,
-    MultimodalToolCorrectnessMetric,
-    MultimodalGEval,
 )
 
 
@@ -69,7 +71,6 @@ __all__ = [
     # Base classes
     "BaseMetric",
     "BaseConversationalMetric",
-    "BaseMultimodalMetric",
     "BaseArenaMetric",
     # Non-LLM metrics
     "ExactMatchMetric",
@@ -119,17 +120,14 @@ __all__ = [
     # Conversational metrics
     "TurnRelevancyMetric",
     "ConversationCompletenessMetric",
+    "TurnFaithfulnessMetric",
+    "TurnContextualPrecisionMetric",
+    "TurnContextualRecallMetric",
+    "TurnContextualRelevancyMetric",
     # Multimodal metrics
     "TextToImageMetric",
     "ImageEditingMetric",
     "ImageCoherenceMetric",
     "ImageHelpfulnessMetric",
     "ImageReferenceMetric",
-    "MultimodalContextualRecallMetric",
-    "MultimodalContextualRelevancyMetric",
-    "MultimodalContextualPrecisionMetric",
-    "MultimodalAnswerRelevancyMetric",
-    "MultimodalFaithfulnessMetric",
-    "MultimodalToolCorrectnessMetric",
-    "MultimodalGEval",
 ]
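The four new turn-level metrics exported here mirror the single-turn RAG metrics for multi-turn conversations. A hedged usage sketch, assuming they follow the same measure() convention as the existing TurnRelevancyMetric and that Turn accepts retrieval_context (assumptions based on the file names above, not on code shown in this diff):

from deepeval.metrics import (
    TurnContextualPrecisionMetric,
    TurnContextualRecallMetric,
    TurnContextualRelevancyMetric,
    TurnFaithfulnessMetric,
)
from deepeval.test_case import ConversationalTestCase, Turn

convo = ConversationalTestCase(
    turns=[
        Turn(role="user", content="Where is my order?"),
        Turn(
            role="assistant",
            content="Your order shipped yesterday.",
            retrieval_context=["Order #123 shipped on 2024-05-01."],
        ),
    ]
)

for metric in (
    TurnFaithfulnessMetric(),
    TurnContextualPrecisionMetric(),
    TurnContextualRecallMetric(),
    TurnContextualRelevancyMetric(),
):
    metric.measure(convo)
    print(type(metric).__name__, metric.score, metric.reason)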