deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +106 -21
  5. deepeval/evaluate/evaluate.py +0 -3
  6. deepeval/evaluate/execute.py +10 -222
  7. deepeval/evaluate/utils.py +6 -30
  8. deepeval/key_handler.py +3 -0
  9. deepeval/metrics/__init__.py +0 -4
  10. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  11. deepeval/metrics/answer_relevancy/template.py +102 -179
  12. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  13. deepeval/metrics/arena_g_eval/template.py +17 -1
  14. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  15. deepeval/metrics/argument_correctness/template.py +19 -2
  16. deepeval/metrics/base_metric.py +13 -41
  17. deepeval/metrics/bias/bias.py +102 -108
  18. deepeval/metrics/bias/template.py +14 -2
  19. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  20. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  22. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  23. deepeval/metrics/conversation_completeness/template.py +23 -3
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  25. deepeval/metrics/conversational_dag/nodes.py +66 -123
  26. deepeval/metrics/conversational_dag/templates.py +16 -0
  27. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  28. deepeval/metrics/dag/dag.py +10 -0
  29. deepeval/metrics/dag/nodes.py +63 -126
  30. deepeval/metrics/dag/templates.py +14 -0
  31. deepeval/metrics/exact_match/exact_match.py +9 -1
  32. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  33. deepeval/metrics/g_eval/g_eval.py +87 -78
  34. deepeval/metrics/g_eval/template.py +18 -1
  35. deepeval/metrics/g_eval/utils.py +7 -6
  36. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  37. deepeval/metrics/goal_accuracy/template.py +21 -3
  38. deepeval/metrics/hallucination/hallucination.py +60 -75
  39. deepeval/metrics/hallucination/template.py +13 -0
  40. deepeval/metrics/indicator.py +3 -6
  41. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  42. deepeval/metrics/json_correctness/template.py +10 -0
  43. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  44. deepeval/metrics/knowledge_retention/schema.py +9 -3
  45. deepeval/metrics/knowledge_retention/template.py +12 -0
  46. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  47. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  48. deepeval/metrics/mcp/template.py +52 -0
  49. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  50. deepeval/metrics/mcp_use_metric/template.py +12 -0
  51. deepeval/metrics/misuse/misuse.py +77 -97
  52. deepeval/metrics/misuse/template.py +15 -0
  53. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  58. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  59. deepeval/metrics/non_advice/non_advice.py +79 -105
  60. deepeval/metrics/non_advice/template.py +12 -0
  61. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  62. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  63. deepeval/metrics/pii_leakage/template.py +14 -0
  64. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  65. deepeval/metrics/plan_adherence/template.py +11 -0
  66. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  67. deepeval/metrics/plan_quality/template.py +9 -0
  68. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  69. deepeval/metrics/prompt_alignment/template.py +12 -0
  70. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  71. deepeval/metrics/role_adherence/template.py +14 -0
  72. deepeval/metrics/role_violation/role_violation.py +75 -108
  73. deepeval/metrics/role_violation/template.py +12 -0
  74. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  75. deepeval/metrics/step_efficiency/template.py +11 -0
  76. deepeval/metrics/summarization/summarization.py +115 -183
  77. deepeval/metrics/summarization/template.py +19 -0
  78. deepeval/metrics/task_completion/task_completion.py +67 -73
  79. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  80. deepeval/metrics/tool_use/tool_use.py +42 -66
  81. deepeval/metrics/topic_adherence/template.py +13 -0
  82. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  83. deepeval/metrics/toxicity/template.py +13 -0
  84. deepeval/metrics/toxicity/toxicity.py +80 -99
  85. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  86. deepeval/metrics/turn_contextual_precision/template.py +1 -1
  87. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
  88. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  89. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
  90. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  91. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
  92. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  93. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
  94. deepeval/metrics/turn_relevancy/template.py +14 -0
  95. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  96. deepeval/metrics/utils.py +145 -90
  97. deepeval/models/base_model.py +44 -6
  98. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  99. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  100. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  101. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  102. deepeval/models/llms/amazon_bedrock_model.py +226 -71
  103. deepeval/models/llms/anthropic_model.py +141 -47
  104. deepeval/models/llms/azure_model.py +167 -94
  105. deepeval/models/llms/constants.py +2032 -0
  106. deepeval/models/llms/deepseek_model.py +79 -29
  107. deepeval/models/llms/gemini_model.py +126 -67
  108. deepeval/models/llms/grok_model.py +125 -59
  109. deepeval/models/llms/kimi_model.py +126 -81
  110. deepeval/models/llms/litellm_model.py +92 -18
  111. deepeval/models/llms/local_model.py +114 -15
  112. deepeval/models/llms/ollama_model.py +97 -76
  113. deepeval/models/llms/openai_model.py +167 -310
  114. deepeval/models/llms/portkey_model.py +58 -16
  115. deepeval/models/llms/utils.py +5 -2
  116. deepeval/models/utils.py +60 -4
  117. deepeval/simulator/conversation_simulator.py +43 -0
  118. deepeval/simulator/template.py +13 -0
  119. deepeval/test_case/api.py +24 -45
  120. deepeval/test_case/arena_test_case.py +7 -2
  121. deepeval/test_case/conversational_test_case.py +55 -6
  122. deepeval/test_case/llm_test_case.py +60 -6
  123. deepeval/test_run/api.py +3 -0
  124. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
  125. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
  126. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  127. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  128. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  129. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  130. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  131. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  132. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  133. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/execute.py CHANGED
@@ -56,15 +56,7 @@ from deepeval.telemetry import capture_evaluation_run
 from deepeval.metrics import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
     TaskCompletionMetric,
-    # RAG metrics that support both single-turn and multimodal
-    ContextualPrecisionMetric,
-    ContextualRecallMetric,
-    ContextualRelevancyMetric,
-    AnswerRelevancyMetric,
-    FaithfulnessMetric,
-    ToolCorrectnessMetric,
 )
 from deepeval.metrics.indicator import (
     measure_metrics_with_indicator,
@@ -116,15 +108,6 @@ from deepeval.test_run.hyperparameters import (

 logger = logging.getLogger(__name__)

-MLLM_SUPPORTED_METRICS = [
-    ContextualPrecisionMetric,
-    ContextualRecallMetric,
-    ContextualRelevancyMetric,
-    AnswerRelevancyMetric,
-    FaithfulnessMetric,
-    ToolCorrectnessMetric,
-]
-

 def _skip_metrics_for_error(
     span: Optional[BaseSpan] = None,
@@ -282,7 +265,6 @@ def execute_test_cases(
     metrics: Union[
         List[BaseMetric],
         List[BaseConversationalMetric],
-        List[BaseMultimodalMetric],
     ],
     error_config: Optional[ErrorConfig] = ErrorConfig(),
     display_config: Optional[DisplayConfig] = DisplayConfig(),
@@ -315,17 +297,12 @@ def execute_test_cases(

     conversational_metrics: List[BaseConversationalMetric] = []
     llm_metrics: List[BaseMetric] = []
-    mllm_metrics: List[BaseMultimodalMetric] = []
     for metric in metrics:
         metric.async_mode = False
         if isinstance(metric, BaseMetric):
             llm_metrics.append(metric)
-            if type(metric) in MLLM_SUPPORTED_METRICS:
-                mllm_metrics.append(metric)
         elif isinstance(metric, BaseConversationalMetric):
             conversational_metrics.append(metric)
-        elif isinstance(metric, BaseMultimodalMetric):
-            mllm_metrics.append(metric)

     test_results: List[TestResult] = []

@@ -333,23 +310,17 @@
         progress: Optional[Progress] = None, pbar_id: Optional[int] = None
     ):
         llm_test_case_count = -1
-        mllm_test_case_count = -1
         conversational_test_case_count = -1
         show_metric_indicator = (
            display_config.show_indicator and not _use_bar_indicator
         )
         for i, test_case in enumerate(test_cases):
             # skip what we know we won't run
-            if isinstance(test_case, LLMTestCase) and not test_case.multimodal:
+            if isinstance(test_case, LLMTestCase):
                 if not llm_metrics:
                     update_pbar(progress, pbar_id)
                     continue
                 per_case_total = len(llm_metrics)
-            elif isinstance(test_case, LLMTestCase) and test_case.multimodal:
-                if not mllm_metrics:
-                    update_pbar(progress, pbar_id)
-                    continue
-                per_case_total = len(mllm_metrics)
             elif isinstance(test_case, ConversationalTestCase):
                 if not conversational_metrics:
                     update_pbar(progress, pbar_id)
@@ -364,35 +335,15 @@

             metrics_for_case = (
                 llm_metrics
-                if (
-                    isinstance(test_case, LLMTestCase)
-                    and not test_case.multimodal
-                )
-                else (
-                    mllm_metrics
-                    if (
-                        isinstance(test_case, LLMTestCase)
-                        and test_case.multimodal
-                    )
-                    else conversational_metrics
-                )
+                if (isinstance(test_case, LLMTestCase))
+                else conversational_metrics
             )
             api_test_case = create_api_test_case(
                 test_case=test_case,
                 index=(
                     llm_test_case_count + 1
-                    if (
-                        isinstance(test_case, LLMTestCase)
-                        and not test_case.multimodal
-                    )
-                    else (
-                        mllm_test_case_count + 1
-                        if (
-                            isinstance(test_case, LLMTestCase)
-                            and test_case.multimodal
-                        )
-                        else conversational_test_case_count + 1
-                    )
+                    if (isinstance(test_case, LLMTestCase))
+                    else (conversational_test_case_count + 1)
                 ),
             )
             emitted = [False] * len(metrics_for_case)
@@ -405,15 +356,12 @@
             try:

                 def _run_case():
-                    nonlocal new_cached_test_case, current_index, llm_test_case_count, mllm_test_case_count, conversational_test_case_count
+                    nonlocal new_cached_test_case, current_index, llm_test_case_count, conversational_test_case_count
                     with capture_evaluation_run("test case"):
                         for metric in metrics:
                             metric.error = None  # Reset metric error

-                        if (
-                            isinstance(test_case, LLMTestCase)
-                            and not test_case.multimodal
-                        ):
+                        if isinstance(test_case, LLMTestCase):
                             llm_test_case_count += 1
                             cached_test_case = None
                             if cache_config.use_cache:
@@ -465,29 +413,6 @@
                                 )
                                 update_pbar(progress, pbar_test_case_id)

-                        # No caching and not sending test cases to Confident AI for multimodal metrics yet
-                        elif (
-                            isinstance(test_case, LLMTestCase)
-                            and test_case.multimodal
-                        ):
-                            mllm_test_case_count += 1
-                            for metric in mllm_metrics:
-                                current_index = index_of[id(metric)]
-                                res = _execute_metric(
-                                    metric=metric,
-                                    test_case=test_case,
-                                    show_metric_indicator=show_metric_indicator,
-                                    in_component=False,
-                                    error_config=error_config,
-                                )
-                                if res == "skip":
-                                    continue
-
-                                metric_data = create_metric_data(metric)
-                                api_test_case.update_metric_data(metric_data)
-                                emitted[current_index] = True
-                                update_pbar(progress, pbar_test_case_id)
-
                         # No caching for conversational metrics yet
                         elif isinstance(test_case, ConversationalTestCase):
                             conversational_test_case_count += 1
@@ -597,7 +522,6 @@ async def a_execute_test_cases(
     metrics: Union[
         List[BaseMetric],
         List[BaseConversationalMetric],
-        List[BaseMultimodalMetric],
     ],
     error_config: Optional[ErrorConfig] = ErrorConfig(),
     display_config: Optional[DisplayConfig] = DisplayConfig(),
@@ -631,20 +555,14 @@
         metric.verbose_mode = display_config.verbose_mode

     llm_metrics: List[BaseMetric] = []
-    mllm_metrics: List[BaseMultimodalMetric] = []
     conversational_metrics: List[BaseConversationalMetric] = []
     for metric in metrics:
         if isinstance(metric, BaseMetric):
             llm_metrics.append(metric)
-            if type(metric) in MLLM_SUPPORTED_METRICS:
-                mllm_metrics.append(metric)
-        elif isinstance(metric, BaseMultimodalMetric):
-            mllm_metrics.append(metric)
         elif isinstance(metric, BaseConversationalMetric):
             conversational_metrics.append(metric)

     llm_test_case_counter = -1
-    mllm_test_case_counter = -1
     conversational_test_case_counter = -1
     test_results: List[Union[TestResult, LLMTestCase]] = []
     tasks = []
@@ -665,10 +583,7 @@
         with progress:
             for test_case in test_cases:
                 with capture_evaluation_run("test case"):
-                    if (
-                        isinstance(test_case, LLMTestCase)
-                        and not test_case.multimodal
-                    ):
+                    if isinstance(test_case, LLMTestCase):
                         if len(llm_metrics) == 0:
                             update_pbar(progress, pbar_id)
                             continue
@@ -696,31 +611,6 @@
                        )
                        tasks.append(asyncio.create_task(task))

-                    elif (
-                        isinstance(test_case, LLMTestCase)
-                        and test_case.multimodal
-                    ):
-                        mllm_test_case_counter += 1
-                        copied_multimodal_metrics: List[
-                            BaseMultimodalMetric
-                        ] = copy_metrics(mllm_metrics)
-                        task = execute_with_semaphore(
-                            func=_a_execute_mllm_test_cases,
-                            metrics=copied_multimodal_metrics,
-                            test_case=test_case,
-                            test_run_manager=test_run_manager,
-                            test_results=test_results,
-                            count=mllm_test_case_counter,
-                            ignore_errors=error_config.ignore_errors,
-                            skip_on_missing_params=error_config.skip_on_missing_params,
-                            show_indicator=display_config.show_indicator,
-                            _use_bar_indicator=_use_bar_indicator,
-                            _is_assert_test=_is_assert_test,
-                            progress=progress,
-                            pbar_id=pbar_id,
-                        )
-                        tasks.append(asyncio.create_task(task))
-
                     elif isinstance(test_case, ConversationalTestCase):
                         conversational_test_case_counter += 1

@@ -763,10 +653,7 @@
     else:
         for test_case in test_cases:
             with capture_evaluation_run("test case"):
-                if (
-                    isinstance(test_case, LLMTestCase)
-                    and not test_case.multimodal
-                ):
+                if isinstance(test_case, LLMTestCase):
                     if len(llm_metrics) == 0:
                         continue
                     llm_test_case_counter += 1
@@ -814,28 +701,6 @@
                    )
                    tasks.append(asyncio.create_task((task)))

-                elif (
-                    isinstance(test_case, LLMTestCase) and test_case.multimodal
-                ):
-                    mllm_test_case_counter += 1
-                    copied_multimodal_metrics: List[BaseMultimodalMetric] = (
-                        copy_metrics(mllm_metrics)
-                    )
-                    task = execute_with_semaphore(
-                        func=_a_execute_mllm_test_cases,
-                        metrics=copied_multimodal_metrics,
-                        test_case=test_case,
-                        test_run_manager=test_run_manager,
-                        test_results=test_results,
-                        count=mllm_test_case_counter,
-                        ignore_errors=error_config.ignore_errors,
-                        skip_on_missing_params=error_config.skip_on_missing_params,
-                        _use_bar_indicator=_use_bar_indicator,
-                        _is_assert_test=_is_assert_test,
-                        show_indicator=display_config.show_indicator,
-                    )
-                    tasks.append(asyncio.create_task(task))
-
             await asyncio.sleep(async_config.throttle_value)

     try:
@@ -976,85 +841,8 @@ async def _a_execute_llm_test_cases(
         update_pbar(progress, pbar_id)


-async def _a_execute_mllm_test_cases(
-    metrics: List[BaseMultimodalMetric],
-    test_case: LLMTestCase,
-    test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, LLMTestCase]],
-    count: int,
-    ignore_errors: bool,
-    skip_on_missing_params: bool,
-    show_indicator: bool,
-    _use_bar_indicator: bool,
-    _is_assert_test: bool,
-    progress: Optional[Progress] = None,
-    pbar_id: Optional[int] = None,
-):
-    show_metrics_indicator = show_indicator and not _use_bar_indicator
-    pbar_test_case_id = add_pbar(
-        progress,
-        f" 🎯 Evaluating test case #{count}",
-        total=len(metrics),
-    )
-
-    for metric in metrics:
-        metric.skipped = False
-        metric.error = None  # Reset metric error
-
-    api_test_case: LLMApiTestCase = create_api_test_case(
-        test_case=test_case, index=count if not _is_assert_test else None
-    )
-    test_start_time = time.perf_counter()
-    try:
-        await measure_metrics_with_indicator(
-            metrics=metrics,
-            test_case=test_case,
-            cached_test_case=None,
-            skip_on_missing_params=skip_on_missing_params,
-            ignore_errors=ignore_errors,
-            show_indicator=show_metrics_indicator,
-            pbar_eval_id=pbar_test_case_id,
-            progress=progress,
-        )
-    except asyncio.CancelledError:
-        msg = (
-            "Timed out/cancelled while evaluating metric. "
-            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
-            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
-        )
-        for m in metrics:
-            if getattr(m, "skipped", False):
-                continue
-            # If the task never finished and didn't set a terminal state, mark it now
-            if getattr(m, "success", None) is None and not getattr(
-                m, "error", None
-            ):
-                m.success = False
-                m.error = msg
-        if not ignore_errors:
-            raise
-    finally:
-        for metric in metrics:
-            if metric.skipped:
-                continue
-
-            metric_data = create_metric_data(metric)
-            api_test_case.update_metric_data(metric_data)
-
-        test_end_time = time.perf_counter()
-        run_duration = test_end_time - test_start_time
-        api_test_case.update_run_duration(run_duration)
-
-        ### Update Test Run ###
-        test_run_manager.update_test_run(api_test_case, test_case)
-        test_results.append(create_test_result(api_test_case))
-        update_pbar(progress, pbar_id)
-
-
 async def _a_execute_conversational_test_cases(
-    metrics: List[
-        Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric]
-    ],
+    metrics: List[Union[BaseMetric, BaseConversationalMetric]],
     test_case: ConversationalTestCase,
     test_run_manager: TestRunManager,
     test_results: List[Union[TestResult, LLMTestCase]],
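
With the multimodal branch removed, execute_test_cases and a_execute_test_cases route every LLMTestCase through the ordinary BaseMetric path and every ConversationalTestCase through BaseConversationalMetric. Below is a minimal, hedged sketch of what a 3.7.6 caller looks like; it assumes a judge model (for example an OpenAI key) is already configured, and the input/output strings are placeholders.

    from deepeval import evaluate
    from deepeval.metrics import AnswerRelevancyMetric
    from deepeval.test_case import LLMTestCase

    # A single-turn test case: in 3.7.6 there is no separate multimodal code path,
    # so an LLMTestCase is always scored by BaseMetric instances such as AnswerRelevancyMetric.
    test_case = LLMTestCase(
        input="What is the capital of France?",
        actual_output="Paris is the capital of France.",
    )

    evaluate(test_cases=[test_case], metrics=[AnswerRelevancyMetric(threshold=0.7)])
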
deepeval/evaluate/utils.py CHANGED
@@ -11,7 +11,6 @@ from deepeval.metrics import (
     ArenaGEval,
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
 )
 from deepeval.test_case import (
     LLMTestCase,
@@ -218,9 +217,9 @@ def validate_assert_test_inputs(
         )

     if test_case and metrics:
-        if (
-            isinstance(test_case, LLMTestCase) and not test_case.multimodal
-        ) and not all(isinstance(metric, BaseMetric) for metric in metrics):
+        if (isinstance(test_case, LLMTestCase)) and not all(
+            isinstance(metric, BaseMetric) for metric in metrics
+        ):
             raise ValueError(
                 "All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only."
             )
@@ -230,18 +229,6 @@ def validate_assert_test_inputs(
             raise ValueError(
                 "All 'metrics' for an 'ConversationalTestCase' must be instances of 'BaseConversationalMetric' only."
             )
-        if (
-            isinstance(test_case, LLMTestCase) and test_case.multimodal
-        ) and not all(
-            (
-                isinstance(metric, BaseMultimodalMetric)
-                or isinstance(metric, BaseMetric)
-            )
-            for metric in metrics
-        ):
-            raise ValueError(
-                "All 'metrics' for multi-modal LLMTestCase must be instances of 'BaseMultimodalMetric' only."
-            )

     if not ((golden and observed_callback) or (test_case and metrics)):
         raise ValueError(
@@ -259,7 +246,6 @@ def validate_evaluate_inputs(
         Union[
             List[BaseMetric],
             List[BaseConversationalMetric],
-            List[BaseMultimodalMetric],
         ]
     ] = None,
     metric_collection: Optional[str] = None,
@@ -292,10 +278,9 @@ def validate_evaluate_inputs(
     if test_cases and metrics:
         for test_case in test_cases:
             for metric in metrics:
-                if (
-                    isinstance(test_case, LLMTestCase)
-                    and not test_case.multimodal
-                ) and not isinstance(metric, BaseMetric):
+                if (isinstance(test_case, LLMTestCase)) and not isinstance(
+                    metric, BaseMetric
+                ):
                     raise ValueError(
                         f"Metric {metric.__name__} is not a valid metric for LLMTestCase."
                     )
@@ -306,15 +291,6 @@ def validate_evaluate_inputs(
                     raise ValueError(
                         f"Metric {metric.__name__} is not a valid metric for ConversationalTestCase."
                     )
-                if (
-                    isinstance(test_case, LLMTestCase) and test_case.multimodal
-                ) and not (
-                    isinstance(metric, BaseMultimodalMetric)
-                    or isinstance(metric, BaseMetric)
-                ):
-                    raise ValueError(
-                        f"Metric {metric.__name__} is not a valid metric for multi-modal LLMTestCase."
-                    )


 def print_test_result(test_result: TestResult, display: TestRunResultDisplay):
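
The simplified validators now enforce a single pairing rule: LLMTestCase with BaseMetric subclasses, ConversationalTestCase with BaseConversationalMetric subclasses. The helper below is not part of deepeval; it is a hedged sketch that mirrors the updated checks so the rule is easy to see in isolation.

    from deepeval.metrics import BaseMetric, BaseConversationalMetric
    from deepeval.test_case import LLMTestCase, ConversationalTestCase

    def check_metric_pairing(test_case, metric) -> None:
        # Mirrors validate_evaluate_inputs after 3.7.6: the multimodal escape hatch is gone.
        if isinstance(test_case, LLMTestCase) and not isinstance(metric, BaseMetric):
            raise ValueError(
                f"Metric {type(metric).__name__} is not a valid metric for LLMTestCase."
            )
        if isinstance(test_case, ConversationalTestCase) and not isinstance(
            metric, BaseConversationalMetric
        ):
            raise ValueError(
                f"Metric {type(metric).__name__} is not a valid metric for ConversationalTestCase."
            )
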
deepeval/key_handler.py CHANGED
@@ -99,7 +99,10 @@ class ModelKeyValues(Enum):
 class EmbeddingKeyValues(Enum):
     # Azure OpenAI
     USE_AZURE_OPENAI_EMBEDDING = "USE_AZURE_OPENAI_EMBEDDING"
+    # Azure OpenAI
+    AZURE_EMBEDDING_MODEL_NAME = "AZURE_EMBEDDING_MODEL_NAME"
     AZURE_EMBEDDING_DEPLOYMENT_NAME = "AZURE_EMBEDDING_DEPLOYMENT_NAME"
+
     # Local
     USE_LOCAL_EMBEDDINGS = "USE_LOCAL_EMBEDDINGS"
     LOCAL_EMBEDDING_MODEL_NAME = "LOCAL_EMBEDDING_MODEL_NAME"
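
The new AZURE_EMBEDDING_MODEL_NAME member gives the Azure embedding model name its own key, separate from the deployment name. A hedged sketch of using the enum values as environment variables follows; whether the settings layer reads them from the environment or from the .deepeval key store is an assumption here, and the model/deployment names are placeholders.

    import os
    from deepeval.key_handler import EmbeddingKeyValues

    # New in 3.7.6: the Azure embedding model name is stored under its own key,
    # separate from the deployment name.
    os.environ[EmbeddingKeyValues.AZURE_EMBEDDING_MODEL_NAME.value] = "text-embedding-3-small"
    os.environ[EmbeddingKeyValues.AZURE_EMBEDDING_DEPLOYMENT_NAME.value] = "my-embedding-deployment"

    print(EmbeddingKeyValues.AZURE_EMBEDDING_MODEL_NAME.value)  # -> "AZURE_EMBEDDING_MODEL_NAME"
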
deepeval/metrics/__init__.py CHANGED
@@ -1,7 +1,6 @@
 from .base_metric import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
     BaseArenaMetric,
 )

@@ -65,7 +64,6 @@ from .multimodal_metrics import (
     ImageCoherenceMetric,
     ImageHelpfulnessMetric,
     ImageReferenceMetric,
-    MultimodalGEval,
 )


@@ -73,7 +71,6 @@ __all__ = [
     # Base classes
     "BaseMetric",
     "BaseConversationalMetric",
-    "BaseMultimodalMetric",
     "BaseArenaMetric",
     # Non-LLM metrics
     "ExactMatchMetric",
@@ -133,5 +130,4 @@ __all__ = [
     "ImageCoherenceMetric",
     "ImageHelpfulnessMetric",
     "ImageReferenceMetric",
-    "MultimodalGEval",
 ]
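
Since BaseMultimodalMetric and MultimodalGEval are no longer exported from deepeval.metrics, code written against 3.7.5 that imports them fails with an ImportError on 3.7.6. A small defensive import, with illustrative fallback names, if a codebase has to run on both versions:

    try:
        # Available up to deepeval 3.7.5
        from deepeval.metrics import BaseMultimodalMetric, MultimodalGEval
        HAS_MULTIMODAL_METRICS = True
    except ImportError:
        # Removed in 3.7.6
        BaseMultimodalMetric = None
        MultimodalGEval = None
        HAS_MULTIMODAL_METRICS = False
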