deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
--- a/deepeval/evaluate/utils.py
+++ b/deepeval/evaluate/utils.py
@@ -16,7 +16,6 @@ from deepeval.metrics import (
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
 )
 from deepeval.test_run import (
     LLMApiTestCase,
@@ -129,17 +128,14 @@ def create_test_result(
             turns=api_test_case.turns,
         )
     else:
-        multimodal = (
-            api_test_case.multimodal_input is not None
-            and api_test_case.multimodal_input_actual_output is not None
-        )
+        multimodal = api_test_case.images_mapping
         if multimodal:
             return TestResult(
                 name=name,
                 success=api_test_case.success,
                 metrics_data=api_test_case.metrics_data,
-                input=api_test_case.multimodal_input,
-                actual_output=api_test_case.multimodal_input_actual_output,
+                input=api_test_case.input,
+                actual_output=api_test_case.actual_output,
                 conversational=False,
                 multimodal=True,
                 additional_metadata=api_test_case.additional_metadata,
@@ -222,9 +218,9 @@ def validate_assert_test_inputs(
         )

     if test_case and metrics:
-        if isinstance(test_case, LLMTestCase) and not all(
-            isinstance(metric, BaseMetric) for metric in metrics
-        ):
+        if (
+            isinstance(test_case, LLMTestCase) and not test_case.multimodal
+        ) and not all(isinstance(metric, BaseMetric) for metric in metrics):
            raise ValueError(
                "All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only."
            )
@@ -234,11 +230,17 @@ def validate_assert_test_inputs(
            raise ValueError(
                "All 'metrics' for an 'ConversationalTestCase' must be instances of 'BaseConversationalMetric' only."
            )
-        if isinstance(test_case, MLLMTestCase) and not all(
-            isinstance(metric, BaseMultimodalMetric) for metric in metrics
+        if (
+            isinstance(test_case, LLMTestCase) and test_case.multimodal
+        ) and not all(
+            (
+                isinstance(metric, BaseMultimodalMetric)
+                or isinstance(metric, BaseMetric)
+            )
+            for metric in metrics
         ):
            raise ValueError(
-                "All 'metrics' for an 'MLLMTestCase' must be instances of 'BaseMultimodalMetric' only."
+                "All 'metrics' for multi-modal LLMTestCase must be instances of 'BaseMultimodalMetric' only."
            )

    if not ((golden and observed_callback) or (test_case and metrics)):
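
The two hunks above replace the old `MLLMTestCase` branch with a `test_case.multimodal` check: a multimodal `LLMTestCase` may now carry either `BaseMultimodalMetric` or plain `BaseMetric` instances. A minimal sketch of what now passes validation, assuming (per the `llm_test_case.py` changes listed above, not confirmed by this hunk) that an `LLMTestCase` whose input or output contains `MLLMImage` parts reports `multimodal=True`:

```python
from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase, MLLMImage

# Hypothetical multimodal test case: image parts live directly in the
# LLMTestCase instead of a separate MLLMTestCase type.
test_case = LLMTestCase(
    input=["Describe this image.", MLLMImage(url="https://example.com/cat.png")],
    actual_output=["A tabby cat sitting on a windowsill."],
)

# test_case.multimodal is True, so a BaseMetric such as AnswerRelevancyMetric
# is accepted where previously only BaseMultimodalMetric subclasses were.
assert_test(test_case, [AnswerRelevancyMetric()])
```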
@@ -251,9 +253,7 @@ def validate_evaluate_inputs(
    goldens: Optional[List] = None,
    observed_callback: Optional[Callable] = None,
    test_cases: Optional[
-        Union[
-            List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
-        ]
+        Union[List[LLMTestCase], List[ConversationalTestCase]]
    ] = None,
    metrics: Optional[
        Union[
@@ -292,9 +292,10 @@ def validate_evaluate_inputs(
    if test_cases and metrics:
        for test_case in test_cases:
            for metric in metrics:
-                if isinstance(test_case, LLMTestCase) and not isinstance(
-                    metric, BaseMetric
-                ):
+                if (
+                    isinstance(test_case, LLMTestCase)
+                    and not test_case.multimodal
+                ) and not isinstance(metric, BaseMetric):
                    raise ValueError(
                        f"Metric {metric.__name__} is not a valid metric for LLMTestCase."
                    )
@@ -305,11 +306,14 @@ def validate_evaluate_inputs(
                    raise ValueError(
                        f"Metric {metric.__name__} is not a valid metric for ConversationalTestCase."
                    )
-                if isinstance(test_case, MLLMTestCase) and not isinstance(
-                    metric, BaseMultimodalMetric
+                if (
+                    isinstance(test_case, LLMTestCase) and test_case.multimodal
+                ) and not (
+                    isinstance(metric, BaseMultimodalMetric)
+                    or isinstance(metric, BaseMetric)
                ):
                    raise ValueError(
-                        f"Metric {metric.__name__} is not a valid metric for MLLMTestCase."
+                        f"Metric {metric.__name__} is not a valid metric for multi-modal LLMTestCase."
                    )

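`validate_evaluate_inputs` gets the same treatment: `test_cases` no longer accepts `List[MLLMTestCase]`, and the per-metric checks key off `test_case.multimodal` instead of the test case's type. A short sketch of the unchanged text-only path through `evaluate`:

```python
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# A plain-text LLMTestCase (multimodal is False) still requires BaseMetric
# instances only, exactly as before this release.
evaluate(
    test_cases=[
        LLMTestCase(
            input="What does deepeval do?",
            actual_output="It is an evaluation framework for LLM apps.",
        )
    ],
    metrics=[AnswerRelevancyMetric()],
)
```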
--- a/deepeval/integrations/pydantic_ai/agent.py
+++ b/deepeval/integrations/pydantic_ai/agent.py
@@ -1,12 +1,29 @@
 import warnings
+from typing import TYPE_CHECKING, Any

 try:
-    from pydantic_ai.agent import Agent
+    from pydantic_ai.agent import Agent as _BaseAgent

     is_pydantic_ai_installed = True
-except:
+except ImportError:
     is_pydantic_ai_installed = False

+    class _BaseAgent:
+        """Dummy fallback so imports don't crash when pydantic-ai is missing."""
+
+        def __init__(self, *args: Any, **kwargs: Any) -> None:
+            # No-op: for compatibility
+            pass
+
+
+if TYPE_CHECKING:
+    # For type checkers: use the real Agent if available.
+    from pydantic_ai.agent import Agent  # type: ignore[unused-ignore]
+else:
+    # At runtime we always have some base: real Agent or our dummy.
+    # This is just to avoid blow-ups.
+    Agent = _BaseAgent
+

 class DeepEvalPydanticAIAgent(Agent):

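The pattern above keeps `deepeval.integrations.pydantic_ai` importable without pydantic-ai: the real `Agent` is imported under a private alias, a dummy stands in on `ImportError`, and `TYPE_CHECKING` hands static analyzers the real type. A generic sketch of the same pattern, with the hypothetical module `some_optional_dep` standing in for pydantic-ai:

```python
from typing import TYPE_CHECKING, Any

try:
    # Real class, imported under a private alias ("some_optional_dep" is
    # a hypothetical stand-in for the optional dependency).
    from some_optional_dep import Widget as _BaseWidget

    is_installed = True
except ImportError:
    is_installed = False

    # Dummy fallback so subclass definitions below don't raise at import time.
    class _BaseWidget:
        def __init__(self, *args: Any, **kwargs: Any) -> None:
            pass


if TYPE_CHECKING:
    from some_optional_dep import Widget
else:
    Widget = _BaseWidget  # real class when installed, dummy otherwise


class MyWidget(Widget):  # safe to define whether or not the dep is present
    pass
```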
--- a/deepeval/integrations/pydantic_ai/instrumentator.py
+++ b/deepeval/integrations/pydantic_ai/instrumentator.py
@@ -1,40 +1,58 @@
+from __future__ import annotations
+
 import json
 import logging
 import os
 from time import perf_counter
-from typing import Literal, Optional, List
+from typing import Any, List, Optional, TYPE_CHECKING

 from deepeval.config.settings import get_settings
 from deepeval.confident.api import get_confident_api_key
 from deepeval.metrics.base_metric import BaseMetric
 from deepeval.prompt import Prompt
 from deepeval.tracing.context import current_trace_context
-from deepeval.tracing.types import Trace
-from deepeval.tracing.otel.utils import to_hex_string
-from deepeval.tracing.tracing import trace_manager
-from deepeval.tracing.otel.utils import normalize_pydantic_ai_messages
 from deepeval.tracing.otel.exporter import ConfidentSpanExporter
-
+from deepeval.tracing.otel.test_exporter import test_exporter
+from deepeval.tracing.otel.utils import (
+    normalize_pydantic_ai_messages,
+    to_hex_string,
+)
+from deepeval.tracing.perf_epoch_bridge import init_clock_bridge
+from deepeval.tracing.tracing import trace_manager
+from deepeval.tracing.types import (
+    AgentSpan,
+    Trace,
+    TraceSpanStatus,
+    ToolCall,
+)

 logger = logging.getLogger(__name__)

-
 try:
-    from pydantic_ai.models.instrumented import InstrumentationSettings
-    from opentelemetry.sdk.trace import SpanProcessor, TracerProvider
+    # Optional dependencies
+    from opentelemetry.sdk.trace import (
+        ReadableSpan as _ReadableSpan,
+        SpanProcessor as _SpanProcessor,
+        TracerProvider,
+    )
     from opentelemetry.sdk.trace.export import BatchSpanProcessor
     from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
         OTLPSpanExporter,
     )
-    from opentelemetry.sdk.trace import ReadableSpan
+    from pydantic_ai.models.instrumented import (
+        InstrumentationSettings as _BaseInstrumentationSettings,
+    )

     dependency_installed = True
 except ImportError as e:
+    dependency_installed = False
+
+    # Preserve previous behavior: only log when verbose mode is enabled.
     if get_settings().DEEPEVAL_VERBOSE_MODE:
         if isinstance(e, ModuleNotFoundError):
             logger.warning(
                 "Optional tracing dependency not installed: %s",
-                e.name,
+                getattr(e, "name", repr(e)),
                 stacklevel=2,
             )
         else:
@@ -43,26 +61,47 @@ except ImportError as e:
                 e,
                 stacklevel=2,
             )
-    dependency_installed = False
+
+    # Dummy fallbacks so imports and class definitions don't crash when
+    # optional deps are missing. Actual use is still guarded by
+    # is_dependency_installed().
+    class _BaseInstrumentationSettings:
+        def __init__(self, *args: Any, **kwargs: Any) -> None:
+            pass
+
+    class _SpanProcessor:
+        def __init__(self, *args: Any, **kwargs: Any) -> None:
+            pass
+
+        def on_start(self, span: Any, parent_context: Any) -> None:
+            pass
+
+        def on_end(self, span: Any) -> None:
+            pass
+
+    class _ReadableSpan:
+        pass


-def is_dependency_installed():
+def is_dependency_installed() -> bool:
     if not dependency_installed:
         raise ImportError(
-            "Dependencies are not installed. Please install it with `pip install pydantic-ai opentelemetry-sdk opentelemetry-exporter-otlp-proto-http`."
+            "Dependencies are not installed. Please install it with "
+            "`pip install pydantic-ai opentelemetry-sdk "
+            "opentelemetry-exporter-otlp-proto-http`."
         )
     return True


-from deepeval.tracing.types import AgentSpan
-from deepeval.confident.api import get_confident_api_key
-from deepeval.prompt import Prompt
-from deepeval.tracing.otel.test_exporter import test_exporter
-from deepeval.tracing.context import current_trace_context
-from deepeval.tracing.types import Trace
-from deepeval.tracing.otel.utils import to_hex_string
-from deepeval.tracing.types import TraceSpanStatus, ToolCall
-from deepeval.tracing.perf_epoch_bridge import init_clock_bridge
+if TYPE_CHECKING:
+    # For type checkers, use real types
+    from opentelemetry.sdk.trace import ReadableSpan, SpanProcessor
+    from pydantic_ai.models.instrumented import InstrumentationSettings
+else:
+    # At runtime we always have something to subclass / annotate with
+    InstrumentationSettings = _BaseInstrumentationSettings
+    SpanProcessor = _SpanProcessor
+    ReadableSpan = _ReadableSpan

 # OTLP_ENDPOINT = "http://127.0.0.1:4318/v1/traces"
 OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
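
Callers never touch the dummy `_SpanProcessor`/`_ReadableSpan` classes directly: the module is designed so every public entry point calls `is_dependency_installed()` first, which raises with the install hint when any optional dependency is absent. A usage sketch:

```python
from deepeval.integrations.pydantic_ai.instrumentator import (
    is_dependency_installed,
)

try:
    is_dependency_installed()
except ImportError as err:
    # Surfaces the `pip install pydantic-ai opentelemetry-sdk ...` hint
    # from the guard above instead of an opaque failure later on.
    print(err)
```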
--- a/deepeval/metrics/__init__.py
+++ b/deepeval/metrics/__init__.py
@@ -42,6 +42,16 @@ from .mcp_use_metric.mcp_use_metric import MCPUseMetric
 from .turn_relevancy.turn_relevancy import (
     TurnRelevancyMetric,
 )
+from .turn_faithfulness.turn_faithfulness import TurnFaithfulnessMetric
+from .turn_contextual_precision.turn_contextual_precision import (
+    TurnContextualPrecisionMetric,
+)
+from .turn_contextual_recall.turn_contextual_recall import (
+    TurnContextualRecallMetric,
+)
+from .turn_contextual_relevancy.turn_contextual_relevancy import (
+    TurnContextualRelevancyMetric,
+)
 from .conversation_completeness.conversation_completeness import (
     ConversationCompletenessMetric,
 )
@@ -55,12 +65,6 @@ from .multimodal_metrics import (
     ImageCoherenceMetric,
     ImageHelpfulnessMetric,
     ImageReferenceMetric,
-    MultimodalContextualRecallMetric,
-    MultimodalContextualRelevancyMetric,
-    MultimodalContextualPrecisionMetric,
-    MultimodalAnswerRelevancyMetric,
-    MultimodalFaithfulnessMetric,
-    MultimodalToolCorrectnessMetric,
     MultimodalGEval,
 )

@@ -119,17 +123,15 @@ __all__ = [
     # Conversational metrics
     "TurnRelevancyMetric",
     "ConversationCompletenessMetric",
+    "TurnFaithfulnessMetric",
+    "TurnContextualPrecisionMetric",
+    "TurnContextualRecallMetric",
+    "TurnContextualRelevancyMetric",
     # Multimodal metrics
     "TextToImageMetric",
     "ImageEditingMetric",
     "ImageCoherenceMetric",
     "ImageHelpfulnessMetric",
     "ImageReferenceMetric",
-    "MultimodalContextualRecallMetric",
-    "MultimodalContextualRelevancyMetric",
-    "MultimodalContextualPrecisionMetric",
-    "MultimodalAnswerRelevancyMetric",
-    "MultimodalFaithfulnessMetric",
-    "MultimodalToolCorrectnessMetric",
     "MultimodalGEval",
 ]
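
Taken together, the `metrics/__init__.py` hunks swap the six removed `Multimodal*` RAG metrics for four new turn-level conversational metrics. A hedged import sketch; the `Turn` fields shown are assumptions based on the existing conversational test case API, not confirmed by this diff:

```python
from deepeval.metrics import (
    TurnContextualPrecisionMetric,
    TurnContextualRecallMetric,
    TurnContextualRelevancyMetric,
    TurnFaithfulnessMetric,
)
from deepeval.test_case import ConversationalTestCase, Turn

convo = ConversationalTestCase(
    turns=[
        Turn(role="user", content="Where is my order?"),
        Turn(
            role="assistant",
            content="It shipped yesterday.",
            # Assumed field: turn-level retrieval context for the RAG metrics.
            retrieval_context=["Order #123 shipped on 2024-05-01."],
        ),
    ]
)
metric = TurnFaithfulnessMetric()
```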
--- a/deepeval/metrics/answer_relevancy/answer_relevancy.py
+++ b/deepeval/metrics/answer_relevancy/answer_relevancy.py
@@ -1,16 +1,17 @@
 from typing import Optional, List, Type, Union

-from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     trimAndLoadJson,
     check_llm_test_case_params,
+    check_mllm_test_case_params,
     initialize_model,
 )
-from deepeval.test_case import (
-    LLMTestCase,
-    LLMTestCaseParams,
-)
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams, MLLMImage
 from deepeval.metrics import BaseMetric
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate
@@ -53,7 +54,14 @@ class AnswerRelevancyMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(test_case, self._required_params, self)
+
+        multimodal = test_case.multimodal
+        if multimodal:
+            check_mllm_test_case_params(
+                test_case, self._required_params, None, None, self, self.model
+            )
+        else:
+            check_llm_test_case_params(test_case, self._required_params, self)

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -70,14 +78,17 @@ class AnswerRelevancyMetric(BaseMetric):
                     )
                 )
             else:
+                input = test_case.input
+                actual_output = test_case.actual_output
+
                 self.statements: List[str] = self._generate_statements(
-                    test_case.actual_output
+                    actual_output, multimodal
                 )
                 self.verdicts: List[AnswerRelevancyVerdict] = (
-                    self._generate_verdicts(test_case.input)
+                    self._generate_verdicts(input, multimodal)
                 )
                 self.score = self._calculate_score()
-                self.reason = self._generate_reason(test_case.input)
+                self.reason = self._generate_reason(input, multimodal)
                 self.success = self.score >= self.threshold
                 self.verbose_logs = construct_verbose_logs(
                     self,
@@ -101,7 +112,14 @@ class AnswerRelevancyMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(test_case, self._required_params, self)
+
+        multimodal = test_case.multimodal
+        if multimodal:
+            check_mllm_test_case_params(
+                test_case, self._required_params, None, None, self, self.model
+            )
+        else:
+            check_llm_test_case_params(test_case, self._required_params, self)

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -110,14 +128,17 @@ class AnswerRelevancyMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
+            input = test_case.input
+            actual_output = test_case.actual_output
+
             self.statements: List[str] = await self._a_generate_statements(
-                test_case.actual_output
+                actual_output, multimodal
             )
             self.verdicts: List[AnswerRelevancyVerdict] = (
-                await self._a_generate_verdicts(test_case.input)
+                await self._a_generate_verdicts(input, multimodal)
             )
             self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason(test_case.input)
+            self.reason = await self._a_generate_reason(input, multimodal)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -133,7 +154,7 @@ class AnswerRelevancyMetric(BaseMetric):
             )
         return self.score

-    async def _a_generate_reason(self, input: str) -> str:
+    async def _a_generate_reason(self, input: str, multimodal: bool) -> str:
         if self.include_reason is False:
             return None

@@ -146,7 +167,9 @@ class AnswerRelevancyMetric(BaseMetric):
             irrelevant_statements=irrelevant_statements,
             input=input,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
         )
+
         if self.using_native_model:
             res, cost = await self.model.a_generate(
                 prompt, schema=AnswerRelevancyScoreReason
@@ -164,7 +187,7 @@ class AnswerRelevancyMetric(BaseMetric):
             data = trimAndLoadJson(res, self)
             return data["reason"]

-    def _generate_reason(self, input: str) -> str:
+    def _generate_reason(self, input: str, multimodal: bool) -> str:
         if self.include_reason is False:
             return None

@@ -177,6 +200,7 @@ class AnswerRelevancyMetric(BaseMetric):
             irrelevant_statements=irrelevant_statements,
             input=input,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
         )

         if self.using_native_model:
@@ -197,14 +221,13 @@ class AnswerRelevancyMetric(BaseMetric):
             return data["reason"]

     async def _a_generate_verdicts(
-        self, input: str
+        self, input: str, multimodal: bool
     ) -> List[AnswerRelevancyVerdict]:
         if len(self.statements) == 0:
             return []

         prompt = self.evaluation_template.generate_verdicts(
-            input=input,
-            statements=self.statements,
+            input=input, statements=self.statements, multimodal=multimodal
         )

         if self.using_native_model:
@@ -224,14 +247,16 @@ class AnswerRelevancyMetric(BaseMetric):
                 AnswerRelevancyVerdict(**item) for item in data["verdicts"]
             ]

-    def _generate_verdicts(self, input: str) -> List[AnswerRelevancyVerdict]:
+    def _generate_verdicts(
+        self, input: str, multimodal: bool
+    ) -> List[AnswerRelevancyVerdict]:
         if len(self.statements) == 0:
             return []

         prompt = self.evaluation_template.generate_verdicts(
-            input=input,
-            statements=self.statements,
+            input=input, statements=self.statements, multimodal=multimodal
         )
+
         if self.using_native_model:
             res, cost = self.model.generate(prompt, schema=Verdicts)
             self.evaluation_cost += cost
@@ -250,44 +275,64 @@ class AnswerRelevancyMetric(BaseMetric):
     async def _a_generate_statements(
         self,
         actual_output: str,
+        multimodal: bool,
     ) -> List[str]:
         prompt = self.evaluation_template.generate_statements(
-            actual_output=actual_output,
+            actual_output=actual_output, multimodal=multimodal
         )
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt, schema=Statements)
             self.evaluation_cost += cost
-            return res.statements
+            statements: List[str] = res.statements + [
+                ele for ele in actual_output if isinstance(ele, MLLMImage)
+            ]
+            return statements
         else:
             try:
                 res: Statements = await self.model.a_generate(
                     prompt, schema=Statements
                 )
-                return res.statements
+                statements: List[str] = res.statements + [
+                    ele for ele in actual_output if isinstance(ele, MLLMImage)
+                ]
+                return statements
             except TypeError:
                 res = await self.model.a_generate(prompt)
                 data = trimAndLoadJson(res, self)
-                return data["statements"]
+                statements = data["statements"] + [
+                    ele for ele in actual_output if isinstance(ele, MLLMImage)
+                ]
+                return statements

     def _generate_statements(
         self,
         actual_output: str,
+        multimodal: bool,
     ) -> List[str]:
         prompt = self.evaluation_template.generate_statements(
-            actual_output=actual_output,
+            actual_output=actual_output, multimodal=multimodal
         )
         if self.using_native_model:
             res, cost = self.model.generate(prompt, schema=Statements)
             self.evaluation_cost += cost
-            return res.statements
+            statements = res.statements + [
+                ele for ele in actual_output if isinstance(ele, MLLMImage)
+            ]
+            return statements
         else:
             try:
                 res: Statements = self.model.generate(prompt, schema=Statements)
-                return res.statements
+                statements = res.statements + [
+                    ele for ele in actual_output if isinstance(ele, MLLMImage)
+                ]
+                return statements
             except TypeError:
                 res = self.model.generate(prompt)
                 data = trimAndLoadJson(res, self)
-                return data["statements"]
+                statements = data["statements"] + [
+                    ele for ele in actual_output if isinstance(ele, MLLMImage)
+                ]
+                return statements

     def _calculate_score(self):
         number_of_verdicts = len(self.verdicts)
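
A minimal sketch of the statement post-processing these hunks introduce, using hypothetical values: text statements come from the judge model's response, and any `MLLMImage` elements in the actual output are appended so that verdicts are also generated over images:

```python
from deepeval.test_case import MLLMImage

# Hypothetical multimodal actual output and judge response.
actual_output = [
    "The chart shows Q3 revenue grew 12%.",
    MLLMImage(url="https://example.com/chart.png"),
]
judge_statements = ["The chart shows Q3 revenue grew 12%."]

# Same comprehension as in the diff: images pass through untouched.
statements = judge_statements + [
    ele for ele in actual_output if isinstance(ele, MLLMImage)
]
assert len(statements) == 2
```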