deepeval-3.7.5-py3-none-any.whl → deepeval-3.7.6-py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +10 -222
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +3 -6
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +1 -1
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +145 -90
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/amazon_bedrock_model.py +226 -71
- deepeval/models/llms/anthropic_model.py +141 -47
- deepeval/models/llms/azure_model.py +167 -94
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +79 -29
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +125 -59
- deepeval/models/llms/kimi_model.py +126 -81
- deepeval/models/llms/litellm_model.py +92 -18
- deepeval/models/llms/local_model.py +114 -15
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +167 -310
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/utils.py +60 -4
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/_version.py
CHANGED
@@ -1 +1 @@
-__version__: str = "3.7.5"
+__version__: str = "3.7.6"
deepeval/config/settings.py
CHANGED
@@ -27,6 +27,7 @@ from pydantic import (
     field_validator,
     model_validator,
     SecretStr,
+    PositiveFloat,
 )
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from typing import Any, Dict, List, Optional, NamedTuple
@@ -317,6 +318,19 @@ class Settings(BaseSettings):
 
     # Anthropic
     ANTHROPIC_API_KEY: Optional[SecretStr] = None
+    ANTHROPIC_MODEL_NAME: Optional[str] = None
+    ANTHROPIC_COST_PER_INPUT_TOKEN: Optional[PositiveFloat] = None
+    ANTHROPIC_COST_PER_OUTPUT_TOKEN: Optional[PositiveFloat] = None
+
+    # AWS
+    AWS_ACCESS_KEY_ID: Optional[SecretStr] = None
+    AWS_SECRET_ACCESS_KEY: Optional[SecretStr] = None
+    # AWS Bedrock
+    USE_AWS_BEDROCK_MODEL: Optional[bool] = None
+    AWS_BEDROCK_MODEL_NAME: Optional[str] = None
+    AWS_BEDROCK_REGION: Optional[str] = None
+    AWS_BEDROCK_COST_PER_INPUT_TOKEN: Optional[PositiveFloat] = None
+    AWS_BEDROCK_COST_PER_OUTPUT_TOKEN: Optional[PositiveFloat] = None
     # Azure Open AI
     AZURE_OPENAI_API_KEY: Optional[SecretStr] = None
     AZURE_OPENAI_ENDPOINT: Optional[AnyUrl] = None
@@ -329,6 +343,8 @@ class Settings(BaseSettings):
     USE_DEEPSEEK_MODEL: Optional[bool] = None
     DEEPSEEK_API_KEY: Optional[SecretStr] = None
     DEEPSEEK_MODEL_NAME: Optional[str] = None
+    DEEPSEEK_COST_PER_INPUT_TOKEN: Optional[float] = None
+    DEEPSEEK_COST_PER_OUTPUT_TOKEN: Optional[float] = None
     # Gemini
     USE_GEMINI_MODEL: Optional[bool] = None
     GOOGLE_API_KEY: Optional[SecretStr] = None
@@ -336,11 +352,13 @@ class Settings(BaseSettings):
     GOOGLE_GENAI_USE_VERTEXAI: Optional[bool] = None
     GOOGLE_CLOUD_PROJECT: Optional[str] = None
     GOOGLE_CLOUD_LOCATION: Optional[str] = None
-    GOOGLE_SERVICE_ACCOUNT_KEY: Optional[
+    GOOGLE_SERVICE_ACCOUNT_KEY: Optional[SecretStr] = None
     # Grok
     USE_GROK_MODEL: Optional[bool] = None
     GROK_API_KEY: Optional[SecretStr] = None
     GROK_MODEL_NAME: Optional[str] = None
+    GROK_COST_PER_INPUT_TOKEN: Optional[float] = None
+    GROK_COST_PER_OUTPUT_TOKEN: Optional[float] = None
     # LiteLLM
     USE_LITELLM: Optional[bool] = None
     LITELLM_API_KEY: Optional[SecretStr] = None
@@ -362,6 +380,8 @@ class Settings(BaseSettings):
     USE_MOONSHOT_MODEL: Optional[bool] = None
     MOONSHOT_API_KEY: Optional[SecretStr] = None
     MOONSHOT_MODEL_NAME: Optional[str] = None
+    MOONSHOT_COST_PER_INPUT_TOKEN: Optional[float] = None
+    MOONSHOT_COST_PER_OUTPUT_TOKEN: Optional[float] = None
     # Ollama
     OLLAMA_MODEL_NAME: Optional[str] = None
     # OpenAI
@@ -388,6 +408,7 @@ class Settings(BaseSettings):
 
     # Azure OpenAI
     USE_AZURE_OPENAI_EMBEDDING: Optional[bool] = None
+    AZURE_EMBEDDING_MODEL_NAME: Optional[str] = None
     AZURE_EMBEDDING_DEPLOYMENT_NAME: Optional[str] = None
     # Local
     USE_LOCAL_EMBEDDINGS: Optional[bool] = None
@@ -614,6 +635,7 @@ class Settings(BaseSettings):
         "SKIP_DEEPEVAL_MISSING_PARAMS",
         "TOKENIZERS_PARALLELISM",
         "TRANSFORMERS_NO_ADVISORY_WARNINGS",
+        "USE_AWS_BEDROCK_MODEL",
         "USE_OPENAI_MODEL",
         "USE_AZURE_OPENAI",
         "USE_LOCAL_MODEL",
@@ -647,6 +669,8 @@ class Settings(BaseSettings):
     @field_validator(
         "OPENAI_COST_PER_INPUT_TOKEN",
         "OPENAI_COST_PER_OUTPUT_TOKEN",
+        "AWS_BEDROCK_COST_PER_INPUT_TOKEN",
+        "AWS_BEDROCK_COST_PER_OUTPUT_TOKEN",
         "TEMPERATURE",
         "CONFIDENT_TRACE_SAMPLE_RATE",
         "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE",
@@ -717,6 +741,16 @@ class Settings(BaseSettings):
             return None
         return s.upper()
 
+    @field_validator("AWS_BEDROCK_REGION", mode="before")
+    @classmethod
+    def _normalize_lower(cls, v):
+        if v is None:
+            return None
+        s = str(v).strip()
+        if not s:
+            return None
+        return s.lower()
+
     @field_validator("DEEPEVAL_SDK_RETRY_PROVIDERS", mode="before")
     @classmethod
     def _coerce_to_list(cls, v):
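
The AWS Bedrock block above follows the same pydantic-settings pattern as the existing provider blocks: optional fields read from the environment, cost-per-token fields typed as PositiveFloat (and wired into the same numeric field validator as the OpenAI cost fields), and the region normalized by a mode="before" validator. A minimal, self-contained sketch of that pattern, assuming pydantic v2 and pydantic-settings are installed; the DemoSettings class and example values are illustrative, not deepeval's actual Settings:

import os
from typing import Optional
from pydantic import PositiveFloat, SecretStr, field_validator
from pydantic_settings import BaseSettings

# Illustrative environment values; real credentials come from your shell/CI.
os.environ["AWS_BEDROCK_REGION"] = " US-EAST-1 "
os.environ["AWS_BEDROCK_COST_PER_INPUT_TOKEN"] = "0.000003"

class DemoSettings(BaseSettings):
    USE_AWS_BEDROCK_MODEL: Optional[bool] = None
    AWS_BEDROCK_MODEL_NAME: Optional[str] = None
    AWS_BEDROCK_REGION: Optional[str] = None
    AWS_BEDROCK_COST_PER_INPUT_TOKEN: Optional[PositiveFloat] = None
    AWS_SECRET_ACCESS_KEY: Optional[SecretStr] = None

    @field_validator("AWS_BEDROCK_REGION", mode="before")
    @classmethod
    def _normalize_lower(cls, v):
        # Same shape as the validator in the diff: None stays None,
        # blank strings become None, everything else is lower-cased.
        if v is None:
            return None
        s = str(v).strip()
        if not s:
            return None
        return s.lower()

settings = DemoSettings()
print(settings.AWS_BEDROCK_REGION)                # us-east-1
print(settings.AWS_BEDROCK_COST_PER_INPUT_TOKEN)  # 3e-06

Note that the Bedrock cost fields use PositiveFloat, which rejects zero or negative per-token costs at load time, while the DeepSeek, Grok, and Moonshot cost fields added in the same release are plain Optional[float].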
deepeval/dataset/api.py
CHANGED
@@ -1,4 +1,4 @@
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator
 from typing import Optional, List
 
 from deepeval.dataset.golden import Golden, ConversationalGolden
@@ -11,6 +11,17 @@ class APIDataset(BaseModel):
         None, alias="conversationalGoldens"
     )
 
+    @model_validator(mode="after")
+    def set_image_mappings_for_goldens(self):
+        if self.goldens:
+            for golden in self.goldens:
+                golden.images_mapping = golden._get_images_mapping()
+        if self.conversational_goldens:
+            for golden in self.conversational_goldens:
+                golden.images_mapping = golden._get_images_mapping()
+
+        return self
+
 
 class APIQueueDataset(BaseModel):
     alias: str
@@ -19,6 +30,17 @@ class APIQueueDataset(BaseModel):
         None, alias="conversationalGoldens"
     )
 
+    @model_validator(mode="after")
+    def set_image_mappings_for_goldens(self):
+        if self.goldens:
+            for golden in self.goldens:
+                golden.images_mapping = golden._get_images_mapping()
+        if self.conversational_goldens:
+            for golden in self.conversational_goldens:
+                golden.images_mapping = golden._get_images_mapping()
+
+        return self
+
 
 class DatasetHttpResponse(BaseModel):
     id: str
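
These validators rely on pydantic v2 validating models inside-out: by the time a parent's mode="after" validator runs, every nested golden has already been constructed, so the dataset can safely populate images_mapping on each one. A hypothetical stand-alone sketch of that ordering (Parent and Child are illustrative names, not deepeval classes):

from typing import List, Optional
from pydantic import BaseModel, model_validator

class Child(BaseModel):
    text: str
    derived: Optional[str] = None

class Parent(BaseModel):
    children: List[Child] = []

    @model_validator(mode="after")
    def fill_derived(self):
        # Children are fully built here, so mutating them is safe;
        # this mirrors APIDataset filling golden.images_mapping.
        for child in self.children:
            child.derived = child.text.upper()
        return self

p = Parent(children=[Child(text="hello")])
print(p.children[0].derived)  # HELLO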
deepeval/dataset/golden.py
CHANGED
@@ -1,6 +1,8 @@
+import re
 from pydantic import BaseModel, Field, PrivateAttr, model_validator
 from typing import Optional, Dict, List
 from deepeval.test_case import ToolCall, Turn, MLLMImage
+from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY
 
 
 class Golden(BaseModel):
@@ -33,6 +35,9 @@ class Golden(BaseModel):
         default=None, serialization_alias="customColumnKeyValues"
     )
     multimodal: bool = Field(False, exclude=True)
+    images_mapping: Dict[str, MLLMImage] = Field(
+        default=None, alias="imagesMapping"
+    )
     _dataset_rank: Optional[int] = PrivateAttr(default=None)
     _dataset_alias: Optional[str] = PrivateAttr(default=None)
     _dataset_id: Optional[str] = PrivateAttr(default=None)
@@ -45,27 +50,60 @@ class Golden(BaseModel):
             return self
 
         pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
-        self.multimodal = (
+        auto_detect = (
             any(
                 [
-                    (
-                        re.search(pattern, self.input) is not None
-                        if self.input
-                        else False
-                    ),
-                    (
-                        re.search(pattern, self.actual_output) is not None
-                        if self.actual_output
-                        else False
-                    ),
+                    re.search(pattern, self.input or "") is not None,
+                    re.search(pattern, self.actual_output or "") is not None,
                 ]
             )
             if isinstance(self.input, str)
             else self.multimodal
         )
+        if self.retrieval_context is not None:
+            auto_detect = auto_detect or any(
+                re.search(pattern, context) is not None
+                for context in self.retrieval_context
+            )
+        if self.context is not None:
+            auto_detect = auto_detect or any(
+                re.search(pattern, context) is not None
+                for context in self.context
+            )
+
+        self.multimodal = auto_detect
 
         return self
 
+    def _get_images_mapping(self) -> Dict[str, MLLMImage]:
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        image_ids = set()
+
+        def extract_ids_from_string(s: Optional[str]) -> None:
+            """Helper to extract image IDs from a string."""
+            if s is not None and isinstance(s, str):
+                matches = re.findall(pattern, s)
+                image_ids.update(matches)
+
+        def extract_ids_from_list(lst: Optional[List[str]]) -> None:
+            """Helper to extract image IDs from a list of strings."""
+            if lst is not None:
+                for item in lst:
+                    extract_ids_from_string(item)
+
+        extract_ids_from_string(self.input)
+        extract_ids_from_string(self.actual_output)
+        extract_ids_from_string(self.expected_output)
+        extract_ids_from_list(self.context)
+        extract_ids_from_list(self.retrieval_context)
+
+        images_mapping = {}
+        for img_id in image_ids:
+            if img_id in _MLLM_IMAGE_REGISTRY:
+                images_mapping[img_id] = _MLLM_IMAGE_REGISTRY[img_id]
+
+        return images_mapping if len(images_mapping) > 0 else None
+
 
 class ConversationalGolden(BaseModel):
     scenario: str
@@ -86,6 +124,9 @@ class ConversationalGolden(BaseModel):
     )
     turns: Optional[List[Turn]] = Field(default=None)
     multimodal: bool = Field(False, exclude=True)
+    images_mapping: Dict[str, MLLMImage] = Field(
+        default=None, alias="imagesMapping"
+    )
     _dataset_rank: Optional[int] = PrivateAttr(default=None)
     _dataset_alias: Optional[str] = PrivateAttr(default=None)
     _dataset_id: Optional[str] = PrivateAttr(default=None)
@@ -98,15 +139,59 @@ class ConversationalGolden(BaseModel):
             return self
 
         pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
-        self.
+        if self.scenario:
+            if re.search(pattern, self.scenario) is not None:
+                self.multimodal = True
+                return self
+        if self.expected_outcome:
+            if re.search(pattern, self.expected_outcome) is not None:
+                self.multimodal = True
+                return self
+        if self.user_description:
+            if re.search(pattern, self.user_description) is not None:
+                self.multimodal = True
+                return self
+        if self.turns:
+            for turn in self.turns:
+                if re.search(pattern, turn.content) is not None:
+                    self.multimodal = True
+                    return self
+                if turn.retrieval_context is not None:
+                    self.multimodal = any(
+                        re.search(pattern, context) is not None
+                        for context in turn.retrieval_context
+                    )
 
         return self
+
+    def _get_images_mapping(self) -> Dict[str, MLLMImage]:
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        image_ids = set()
+
+        def extract_ids_from_string(s: Optional[str]) -> None:
+            """Helper to extract image IDs from a string."""
+            if s is not None and isinstance(s, str):
+                matches = re.findall(pattern, s)
+                image_ids.update(matches)
+
+        def extract_ids_from_list(lst: Optional[List[str]]) -> None:
+            """Helper to extract image IDs from a list of strings."""
+            if lst is not None:
+                for item in lst:
+                    extract_ids_from_string(item)
+
+        extract_ids_from_string(self.scenario)
+        extract_ids_from_string(self.expected_outcome)
+        extract_ids_from_list(self.context)
+        extract_ids_from_string(self.user_description)
+        if self.turns:
+            for turn in self.turns:
+                extract_ids_from_string(turn.content)
+                extract_ids_from_list(turn.retrieval_context)
+
+        images_mapping = {}
+        for img_id in image_ids:
+            if img_id in _MLLM_IMAGE_REGISTRY:
+                images_mapping[img_id] = _MLLM_IMAGE_REGISTRY[img_id]
+
+        return images_mapping if len(images_mapping) > 0 else None
deepeval/evaluate/evaluate.py
CHANGED
@@ -46,7 +46,6 @@ from deepeval.telemetry import capture_evaluation_run
 from deepeval.metrics import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
 )
 from deepeval.metrics.indicator import (
     format_metric_description,
@@ -75,7 +74,6 @@ def assert_test(
         Union[
             List[BaseMetric],
             List[BaseConversationalMetric],
-            List[BaseMultimodalMetric],
         ]
     ] = None,
     golden: Optional[Golden] = None,
@@ -190,7 +188,6 @@ def evaluate(
         Union[
             List[BaseMetric],
             List[BaseConversationalMetric],
-            List[BaseMultimodalMetric],
         ]
     ] = None,
     # Evals on Confident AI