deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/test.py +1 -1
- deepeval/config/settings.py +102 -13
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/configs.py +1 -1
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +74 -27
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/argument_correctness/template.py +2 -2
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/bias/template.py +3 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/template.py +2 -2
- deepeval/metrics/conversational_dag/templates.py +4 -4
- deepeval/metrics/conversational_g_eval/template.py +4 -3
- deepeval/metrics/dag/templates.py +5 -5
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/hallucination/template.py +4 -4
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/misuse/template.py +2 -2
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/non_advice/template.py +2 -2
- deepeval/metrics/pii_leakage/template.py +2 -2
- deepeval/metrics/prompt_alignment/template.py +4 -4
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_violation/template.py +2 -2
- deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/toxicity/template.py +4 -4
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/turn_relevancy/template.py +2 -2
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +69 -32
- deepeval/models/embedding_models/local_embedding_model.py +39 -22
- deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
- deepeval/models/embedding_models/openai_embedding_model.py +50 -15
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +53 -20
- deepeval/models/llms/azure_model.py +140 -43
- deepeval/models/llms/deepseek_model.py +38 -23
- deepeval/models/llms/gemini_model.py +222 -103
- deepeval/models/llms/grok_model.py +39 -27
- deepeval/models/llms/kimi_model.py +39 -23
- deepeval/models/llms/litellm_model.py +103 -45
- deepeval/models/llms/local_model.py +35 -22
- deepeval/models/llms/ollama_model.py +129 -17
- deepeval/models/llms/openai_model.py +151 -50
- deepeval/models/llms/portkey_model.py +149 -0
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +94 -4
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/optimizer/algorithms/copro/copro.py +836 -0
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/optimizer/algorithms/simba/simba.py +999 -0
- deepeval/optimizer/algorithms/simba/types.py +15 -0
- deepeval/optimizer/configs.py +31 -0
- deepeval/optimizer/policies.py +227 -0
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/optimizer/utils.py +480 -0
- deepeval/prompt/prompt.py +7 -6
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/cache.py +2 -0
- deepeval/test_run/test_run.py +9 -4
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +89 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -334
- deepeval/models/mlllms/gemini_model.py +0 -284
- deepeval/models/mlllms/ollama_model.py +0 -144
- deepeval/models/mlllms/openai_model.py +0 -258
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/utils.py
CHANGED
@@ -16,7 +16,6 @@ from deepeval.metrics import (
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
 )
 from deepeval.test_run import (
     LLMApiTestCase,
@@ -129,17 +128,14 @@ def create_test_result(
             turns=api_test_case.turns,
         )
     else:
-        multimodal = (
-            api_test_case.multimodal_input is not None
-            and api_test_case.multimodal_input_actual_output is not None
-        )
+        multimodal = api_test_case.images_mapping
         if multimodal:
             return TestResult(
                 name=name,
                 success=api_test_case.success,
                 metrics_data=api_test_case.metrics_data,
-                input=api_test_case.multimodal_input,
-                actual_output=api_test_case.multimodal_input_actual_output,
+                input=api_test_case.input,
+                actual_output=api_test_case.actual_output,
                 conversational=False,
                 multimodal=True,
                 additional_metadata=api_test_case.additional_metadata,
@@ -222,9 +218,9 @@ def validate_assert_test_inputs(
         )

     if test_case and metrics:
-        if isinstance(test_case, LLMTestCase) and not all(
-            isinstance(metric, BaseMetric) for metric in metrics
-        ):
+        if (
+            isinstance(test_case, LLMTestCase) and not test_case.multimodal
+        ) and not all(isinstance(metric, BaseMetric) for metric in metrics):
             raise ValueError(
                 "All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only."
             )
@@ -234,11 +230,17 @@ def validate_assert_test_inputs(
            raise ValueError(
                 "All 'metrics' for an 'ConversationalTestCase' must be instances of 'BaseConversationalMetric' only."
             )
-        if isinstance(test_case, MLLMTestCase) and not all(
-            isinstance(metric, BaseMultimodalMetric) for metric in metrics
+        if (
+            isinstance(test_case, LLMTestCase) and test_case.multimodal
+        ) and not all(
+            (
+                isinstance(metric, BaseMultimodalMetric)
+                or isinstance(metric, BaseMetric)
+            )
+            for metric in metrics
         ):
             raise ValueError(
-                "All 'metrics' for an 'MLLMTestCase' must be instances of 'BaseMultimodalMetric' only."
+                "All 'metrics' for multi-modal LLMTestCase must be instances of 'BaseMultimodalMetric' only."
             )

     if not ((golden and observed_callback) or (test_case and metrics)):
@@ -251,9 +253,7 @@ def validate_evaluate_inputs(
     goldens: Optional[List] = None,
     observed_callback: Optional[Callable] = None,
     test_cases: Optional[
-        Union[
-            List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
-        ]
+        Union[List[LLMTestCase], List[ConversationalTestCase]]
     ] = None,
     metrics: Optional[
         Union[
@@ -292,9 +292,10 @@ def validate_evaluate_inputs(
     if test_cases and metrics:
         for test_case in test_cases:
             for metric in metrics:
-                if isinstance(test_case, LLMTestCase) and not isinstance(
-                    metric, BaseMetric
-                ):
+                if (
+                    isinstance(test_case, LLMTestCase)
+                    and not test_case.multimodal
+                ) and not isinstance(metric, BaseMetric):
                     raise ValueError(
                         f"Metric {metric.__name__} is not a valid metric for LLMTestCase."
                     )
@@ -305,11 +306,14 @@ def validate_evaluate_inputs(
                     raise ValueError(
                         f"Metric {metric.__name__} is not a valid metric for ConversationalTestCase."
                     )
-                if isinstance(test_case, MLLMTestCase) and not isinstance(
-                    metric, BaseMultimodalMetric
+                if (
+                    isinstance(test_case, LLMTestCase) and test_case.multimodal
+                ) and not (
+                    isinstance(metric, BaseMultimodalMetric)
+                    or isinstance(metric, BaseMetric)
                 ):
                     raise ValueError(
-                        f"Metric {metric.__name__} is not a valid metric for MLLMTestCase."
+                        f"Metric {metric.__name__} is not a valid metric for multi-modal LLMTestCase."
                     )

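Net effect of the validation changes above: the removed MLLMTestCase is folded into LLMTestCase, and a test case flagged as multimodal may be paired with either BaseMetric or BaseMultimodalMetric instances, while a text-only LLMTestCase still accepts BaseMetric only. A minimal sketch of that dispatch rule, using hypothetical stand-in classes rather than deepeval's own:

```python
# Minimal sketch of the validation rule above, using hypothetical stand-ins
# (TextMetric, ImageMetric, Case are NOT deepeval classes).
from dataclasses import dataclass, field
from typing import List


class TextMetric:        # stand-in for BaseMetric
    pass


class ImageMetric:       # stand-in for BaseMultimodalMetric
    pass


@dataclass
class Case:              # stand-in for LLMTestCase with its multimodal flag
    multimodal: bool = False
    metrics: List[object] = field(default_factory=list)


def validate(case: Case) -> None:
    if not case.multimodal:
        # text-only test cases accept text metrics only
        if not all(isinstance(m, TextMetric) for m in case.metrics):
            raise ValueError("text test cases need text metrics only")
    else:
        # multimodal test cases accept either kind of metric
        if not all(isinstance(m, (TextMetric, ImageMetric)) for m in case.metrics):
            raise ValueError("multimodal test cases accept text or multimodal metrics")


validate(Case(multimodal=True, metrics=[TextMetric(), ImageMetric()]))  # passes
```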
deepeval/integrations/pydantic_ai/agent.py
CHANGED
@@ -1,12 +1,29 @@
 import warnings
+from typing import TYPE_CHECKING, Any

 try:
-    from pydantic_ai.agent import Agent
+    from pydantic_ai.agent import Agent as _BaseAgent

     is_pydantic_ai_installed = True
-except:
+except ImportError:
     is_pydantic_ai_installed = False

+    class _BaseAgent:
+        """Dummy fallback so imports don't crash when pydantic-ai is missing."""
+
+        def __init__(self, *args: Any, **kwargs: Any) -> None:
+            # No-op: for compatibility
+            pass
+
+
+if TYPE_CHECKING:
+    # For type checkers: use the real Agent if available.
+    from pydantic_ai.agent import Agent  # type: ignore[unused-ignore]
+else:
+    # At runtime we always have some base: real Agent or our dummy.
+    # This is just to avoid blow-ups.
+    Agent = _BaseAgent
+

 class DeepEvalPydanticAIAgent(Agent):

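The rewritten import block above is the usual optional-dependency fallback: alias the real class inside try, define a no-op stand-in under except ImportError, and let TYPE_CHECKING keep static type checkers pointed at the real type. A generic sketch of the same pattern (somelib and Widget are placeholders, not real packages):

```python
# Generic sketch of the optional-dependency fallback used above;
# "somelib" is a placeholder package name, not a real dependency.
from typing import TYPE_CHECKING, Any

try:
    from somelib import Widget as _BaseWidget  # real base class if installed

    HAS_SOMELIB = True
except ImportError:
    HAS_SOMELIB = False

    class _BaseWidget:
        """No-op fallback so the subclass below never fails at import time."""

        def __init__(self, *args: Any, **kwargs: Any) -> None:
            pass


if TYPE_CHECKING:
    from somelib import Widget  # type checkers see the real type
else:
    Widget = _BaseWidget  # at runtime: the real class or the dummy


class MyWidget(Widget):
    """Subclassing works whether or not the optional dependency is present."""
```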
deepeval/integrations/pydantic_ai/instrumentator.py
CHANGED
@@ -1,40 +1,58 @@
+from __future__ import annotations
+
 import json
 import logging
 import os
 from time import perf_counter
-from typing import
+from typing import Any, List, Optional, TYPE_CHECKING

 from deepeval.config.settings import get_settings
 from deepeval.confident.api import get_confident_api_key
 from deepeval.metrics.base_metric import BaseMetric
 from deepeval.prompt import Prompt
 from deepeval.tracing.context import current_trace_context
-from deepeval.tracing.types import Trace
-from deepeval.tracing.otel.utils import to_hex_string
-from deepeval.tracing.tracing import trace_manager
-from deepeval.tracing.otel.utils import normalize_pydantic_ai_messages
 from deepeval.tracing.otel.exporter import ConfidentSpanExporter
-
+from deepeval.tracing.otel.test_exporter import test_exporter
+from deepeval.tracing.otel.utils import (
+    normalize_pydantic_ai_messages,
+    to_hex_string,
+)
+from deepeval.tracing.perf_epoch_bridge import init_clock_bridge
+from deepeval.tracing.tracing import trace_manager
+from deepeval.tracing.types import (
+    AgentSpan,
+    Trace,
+    TraceSpanStatus,
+    ToolCall,
+)

 logger = logging.getLogger(__name__)

-
 try:
-
-    from opentelemetry.sdk.trace import
+    # Optional dependencies
+    from opentelemetry.sdk.trace import (
+        ReadableSpan as _ReadableSpan,
+        SpanProcessor as _SpanProcessor,
+        TracerProvider,
+    )
     from opentelemetry.sdk.trace.export import BatchSpanProcessor
     from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
         OTLPSpanExporter,
     )
-    from
+    from pydantic_ai.models.instrumented import (
+        InstrumentationSettings as _BaseInstrumentationSettings,
+    )

     dependency_installed = True
 except ImportError as e:
+    dependency_installed = False
+
+    # Preserve previous behavior: only log when verbose mode is enabled.
     if get_settings().DEEPEVAL_VERBOSE_MODE:
         if isinstance(e, ModuleNotFoundError):
             logger.warning(
                 "Optional tracing dependency not installed: %s",
-                e,
+                getattr(e, "name", repr(e)),
                 stacklevel=2,
             )
         else:
@@ -43,26 +61,47 @@ except ImportError as e:
                 e,
                 stacklevel=2,
             )
-
+
+    # Dummy fallbacks so imports and class definitions don't crash when
+    # optional deps are missing. Actual use is still guarded by
+    # is_dependency_installed().
+    class _BaseInstrumentationSettings:
+        def __init__(self, *args: Any, **kwargs: Any) -> None:
+            pass
+
+    class _SpanProcessor:
+        def __init__(self, *args: Any, **kwargs: Any) -> None:
+            pass
+
+        def on_start(self, span: Any, parent_context: Any) -> None:
+            pass
+
+        def on_end(self, span: Any) -> None:
+            pass
+
+    class _ReadableSpan:
+        pass


-def is_dependency_installed():
+def is_dependency_installed() -> bool:
     if not dependency_installed:
         raise ImportError(
-            "Dependencies are not installed. Please install it with
+            "Dependencies are not installed. Please install it with "
+            "`pip install pydantic-ai opentelemetry-sdk "
+            "opentelemetry-exporter-otlp-proto-http`."
         )
     return True


-
-
-from
-from
-
-
-
-
-
+if TYPE_CHECKING:
+    # For type checkers, use real types
+    from opentelemetry.sdk.trace import ReadableSpan, SpanProcessor
+    from pydantic_ai.models.instrumented import InstrumentationSettings
+else:
+    # At runtime we always have something to subclass / annotate with
+    InstrumentationSettings = _BaseInstrumentationSettings
+    SpanProcessor = _SpanProcessor
+    ReadableSpan = _ReadableSpan

 # OTLP_ENDPOINT = "http://127.0.0.1:4318/v1/traces"
 OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
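The dummy _SpanProcessor / _ReadableSpan / _BaseInstrumentationSettings classes above only keep imports and class definitions from failing; real use still goes through is_dependency_installed(), which raises with an install hint. A hedged sketch of that guard style with illustrative names (not deepeval's API):

```python
# Illustrative guard in the style of is_dependency_installed() above; the
# module-level flag and function names here are assumptions for the sketch.
_DEPS_INSTALLED = False  # would be set True by the try/except import block


def require_optional_deps() -> bool:
    """Raise a helpful error if the optional tracing stack is missing."""
    if not _DEPS_INSTALLED:
        raise ImportError(
            "Dependencies are not installed. Please install them with "
            "`pip install pydantic-ai opentelemetry-sdk "
            "opentelemetry-exporter-otlp-proto-http`."
        )
    return True


def setup_instrumentation() -> None:
    # hypothetical caller: fail fast with the install hint before touching
    # SpanProcessor / exporter objects
    require_optional_deps()
```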
deepeval/metrics/__init__.py
CHANGED
@@ -42,6 +42,16 @@ from .mcp_use_metric.mcp_use_metric import MCPUseMetric
 from .turn_relevancy.turn_relevancy import (
     TurnRelevancyMetric,
 )
+from .turn_faithfulness.turn_faithfulness import TurnFaithfulnessMetric
+from .turn_contextual_precision.turn_contextual_precision import (
+    TurnContextualPrecisionMetric,
+)
+from .turn_contextual_recall.turn_contextual_recall import (
+    TurnContextualRecallMetric,
+)
+from .turn_contextual_relevancy.turn_contextual_relevancy import (
+    TurnContextualRelevancyMetric,
+)
 from .conversation_completeness.conversation_completeness import (
     ConversationCompletenessMetric,
 )
@@ -55,12 +65,6 @@ from .multimodal_metrics import (
     ImageCoherenceMetric,
     ImageHelpfulnessMetric,
     ImageReferenceMetric,
-    MultimodalContextualRecallMetric,
-    MultimodalContextualRelevancyMetric,
-    MultimodalContextualPrecisionMetric,
-    MultimodalAnswerRelevancyMetric,
-    MultimodalFaithfulnessMetric,
-    MultimodalToolCorrectnessMetric,
     MultimodalGEval,
 )

@@ -119,17 +123,15 @@ __all__ = [
     # Conversational metrics
     "TurnRelevancyMetric",
     "ConversationCompletenessMetric",
+    "TurnFaithfulnessMetric",
+    "TurnContextualPrecisionMetric",
+    "TurnContextualRecallMetric",
+    "TurnContextualRelevancyMetric",
     # Multimodal metrics
     "TextToImageMetric",
     "ImageEditingMetric",
     "ImageCoherenceMetric",
     "ImageHelpfulnessMetric",
     "ImageReferenceMetric",
-    "MultimodalContextualRecallMetric",
-    "MultimodalContextualRelevancyMetric",
-    "MultimodalContextualPrecisionMetric",
-    "MultimodalAnswerRelevancyMetric",
-    "MultimodalFaithfulnessMetric",
-    "MultimodalToolCorrectnessMetric",
     "MultimodalGEval",
 ]
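Given the exports added above, the new turn-level metrics should be importable directly from deepeval.metrics in 3.7.5, while the removed Multimodal* RAG metrics no longer resolve. A minimal import sketch (assumes deepeval >= 3.7.5 is installed):

```python
# Import sketch based on the exports added above (assumes deepeval >= 3.7.5).
from deepeval.metrics import (
    TurnContextualPrecisionMetric,
    TurnContextualRecallMetric,
    TurnContextualRelevancyMetric,
    TurnFaithfulnessMetric,
)

# The multimodal RAG metrics removed above would now fail to import, e.g.:
# from deepeval.metrics import MultimodalFaithfulnessMetric  # ImportError in 3.7.5
```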
deepeval/metrics/answer_relevancy/answer_relevancy.py
CHANGED
@@ -1,16 +1,17 @@
 from typing import Optional, List, Type, Union

-from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     trimAndLoadJson,
     check_llm_test_case_params,
+    check_mllm_test_case_params,
     initialize_model,
 )
-from deepeval.test_case import (
-    LLMTestCase,
-    LLMTestCaseParams,
-)
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams, MLLMImage
 from deepeval.metrics import BaseMetric
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate
@@ -53,7 +54,14 @@ class AnswerRelevancyMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(test_case, self._required_params, self)
+
+        multimodal = test_case.multimodal
+        if multimodal:
+            check_mllm_test_case_params(
+                test_case, self._required_params, None, None, self, self.model
+            )
+        else:
+            check_llm_test_case_params(test_case, self._required_params, self)

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -70,14 +78,17 @@ class AnswerRelevancyMetric(BaseMetric):
                     )
                 )
             else:
+                input = test_case.input
+                actual_output = test_case.actual_output
+
                 self.statements: List[str] = self._generate_statements(
-                    test_case.actual_output
+                    actual_output, multimodal
                 )
                 self.verdicts: List[AnswerRelevancyVerdict] = (
-                    self._generate_verdicts(test_case.input)
+                    self._generate_verdicts(input, multimodal)
                 )
                 self.score = self._calculate_score()
-                self.reason = self._generate_reason(test_case.input)
+                self.reason = self._generate_reason(input, multimodal)
                 self.success = self.score >= self.threshold
                 self.verbose_logs = construct_verbose_logs(
                     self,
@@ -101,7 +112,14 @@ class AnswerRelevancyMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(test_case, self._required_params, self)
+
+        multimodal = test_case.multimodal
+        if multimodal:
+            check_mllm_test_case_params(
+                test_case, self._required_params, None, None, self, self.model
+            )
+        else:
+            check_llm_test_case_params(test_case, self._required_params, self)

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -110,14 +128,17 @@ class AnswerRelevancyMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
+            input = test_case.input
+            actual_output = test_case.actual_output
+
             self.statements: List[str] = await self._a_generate_statements(
-                test_case.actual_output
+                actual_output, multimodal
             )
             self.verdicts: List[AnswerRelevancyVerdict] = (
-                await self._a_generate_verdicts(test_case.input)
+                await self._a_generate_verdicts(input, multimodal)
             )
             self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason(test_case.input)
+            self.reason = await self._a_generate_reason(input, multimodal)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -133,7 +154,7 @@ class AnswerRelevancyMetric(BaseMetric):
         )
         return self.score

-    async def _a_generate_reason(self, input: str) -> str:
+    async def _a_generate_reason(self, input: str, multimodal: bool) -> str:
         if self.include_reason is False:
             return None

@@ -146,7 +167,9 @@ class AnswerRelevancyMetric(BaseMetric):
             irrelevant_statements=irrelevant_statements,
             input=input,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
         )
+
         if self.using_native_model:
             res, cost = await self.model.a_generate(
                 prompt, schema=AnswerRelevancyScoreReason
@@ -164,7 +187,7 @@ class AnswerRelevancyMetric(BaseMetric):
             data = trimAndLoadJson(res, self)
             return data["reason"]

-    def _generate_reason(self, input: str) -> str:
+    def _generate_reason(self, input: str, multimodal: bool) -> str:
         if self.include_reason is False:
             return None

@@ -177,6 +200,7 @@ class AnswerRelevancyMetric(BaseMetric):
             irrelevant_statements=irrelevant_statements,
             input=input,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
         )

         if self.using_native_model:
@@ -197,14 +221,13 @@ class AnswerRelevancyMetric(BaseMetric):
             return data["reason"]

     async def _a_generate_verdicts(
-        self, input: str
+        self, input: str, multimodal: bool
     ) -> List[AnswerRelevancyVerdict]:
         if len(self.statements) == 0:
             return []

         prompt = self.evaluation_template.generate_verdicts(
-            input=input,
-            statements=self.statements,
+            input=input, statements=self.statements, multimodal=multimodal
         )

         if self.using_native_model:
@@ -224,14 +247,16 @@ class AnswerRelevancyMetric(BaseMetric):
                 AnswerRelevancyVerdict(**item) for item in data["verdicts"]
             ]

-    def _generate_verdicts(self, input: str) -> List[AnswerRelevancyVerdict]:
+    def _generate_verdicts(
+        self, input: str, multimodal: bool
+    ) -> List[AnswerRelevancyVerdict]:
         if len(self.statements) == 0:
             return []

         prompt = self.evaluation_template.generate_verdicts(
-            input=input,
-            statements=self.statements,
+            input=input, statements=self.statements, multimodal=multimodal
         )
+
         if self.using_native_model:
             res, cost = self.model.generate(prompt, schema=Verdicts)
             self.evaluation_cost += cost
@@ -250,44 +275,64 @@ class AnswerRelevancyMetric(BaseMetric):
     async def _a_generate_statements(
         self,
         actual_output: str,
+        multimodal: bool,
     ) -> List[str]:
         prompt = self.evaluation_template.generate_statements(
-            actual_output=actual_output,
+            actual_output=actual_output, multimodal=multimodal
         )
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt, schema=Statements)
             self.evaluation_cost += cost
-            return res.statements
+            statements: List[str] = res.statements + [
+                ele for ele in actual_output if isinstance(ele, MLLMImage)
+            ]
+            return statements
         else:
             try:
                 res: Statements = await self.model.a_generate(
                     prompt, schema=Statements
                 )
-                return res.statements
+                statements: List[str] = res.statements + [
+                    ele for ele in actual_output if isinstance(ele, MLLMImage)
+                ]
+                return statements
             except TypeError:
                 res = await self.model.a_generate(prompt)
                 data = trimAndLoadJson(res, self)
-                return data["statements"]
+                statements = data["statements"] + [
+                    ele for ele in actual_output if isinstance(ele, MLLMImage)
+                ]
+                return statements

     def _generate_statements(
         self,
         actual_output: str,
+        multimodal: bool,
     ) -> List[str]:
         prompt = self.evaluation_template.generate_statements(
-            actual_output=actual_output,
+            actual_output=actual_output, multimodal=multimodal
        )
         if self.using_native_model:
             res, cost = self.model.generate(prompt, schema=Statements)
             self.evaluation_cost += cost
-            return res.statements
+            statements = res.statements + [
+                ele for ele in actual_output if isinstance(ele, MLLMImage)
+            ]
+            return statements
         else:
             try:
                 res: Statements = self.model.generate(prompt, schema=Statements)
-                return res.statements
+                statements = res.statements + [
+                    ele for ele in actual_output if isinstance(ele, MLLMImage)
+                ]
+                return statements
             except TypeError:
                 res = self.model.generate(prompt)
                 data = trimAndLoadJson(res, self)
-                return data["statements"]
+                statements = data["statements"] + [
+                    ele for ele in actual_output if isinstance(ele, MLLMImage)
+                ]
+                return statements

     def _calculate_score(self):
         number_of_verdicts = len(self.verdicts)
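The statement-generation change above keeps MLLMImage elements from a mixed actual_output alongside the model-extracted text statements, so image content flows into the verdict step for multimodal test cases. A small self-contained sketch of that merge, with a stand-in image class and a stand-in extractor instead of deepeval's real MLLMImage and judge-model call:

```python
# Sketch of the statement merge above. FakeImage stands in for deepeval's
# MLLMImage and extract_statements() for the judge-model call, so the example
# runs without deepeval installed.
from dataclasses import dataclass
from typing import Any, List, Sequence


@dataclass
class FakeImage:  # stand-in for MLLMImage
    url: str


def extract_statements(actual_output: Sequence[Any]) -> List[str]:
    # pretend the judge model split the text segments into statements
    return [seg for seg in actual_output if isinstance(seg, str)]


def merge_statements(actual_output: Sequence[Any]) -> List[Any]:
    statements: List[Any] = list(extract_statements(actual_output))
    # mirror the diff: image elements ride along so verdicts can consider them
    statements += [ele for ele in actual_output if isinstance(ele, FakeImage)]
    return statements


mixed = ["Q3 revenue rose 12%.", FakeImage(url="https://example.com/chart.png")]
print(merge_statements(mixed))  # the text statement followed by the image element
```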