deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/test.py +1 -1
- deepeval/config/settings.py +102 -13
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/configs.py +1 -1
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +74 -27
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/argument_correctness/template.py +2 -2
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/bias/template.py +3 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/template.py +2 -2
- deepeval/metrics/conversational_dag/templates.py +4 -4
- deepeval/metrics/conversational_g_eval/template.py +4 -3
- deepeval/metrics/dag/templates.py +5 -5
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/hallucination/template.py +4 -4
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/misuse/template.py +2 -2
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/non_advice/template.py +2 -2
- deepeval/metrics/pii_leakage/template.py +2 -2
- deepeval/metrics/prompt_alignment/template.py +4 -4
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_violation/template.py +2 -2
- deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/toxicity/template.py +4 -4
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/turn_relevancy/template.py +2 -2
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +69 -32
- deepeval/models/embedding_models/local_embedding_model.py +39 -22
- deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
- deepeval/models/embedding_models/openai_embedding_model.py +50 -15
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +53 -20
- deepeval/models/llms/azure_model.py +140 -43
- deepeval/models/llms/deepseek_model.py +38 -23
- deepeval/models/llms/gemini_model.py +222 -103
- deepeval/models/llms/grok_model.py +39 -27
- deepeval/models/llms/kimi_model.py +39 -23
- deepeval/models/llms/litellm_model.py +103 -45
- deepeval/models/llms/local_model.py +35 -22
- deepeval/models/llms/ollama_model.py +129 -17
- deepeval/models/llms/openai_model.py +151 -50
- deepeval/models/llms/portkey_model.py +149 -0
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +94 -4
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/optimizer/algorithms/copro/copro.py +836 -0
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/optimizer/algorithms/simba/simba.py +999 -0
- deepeval/optimizer/algorithms/simba/types.py +15 -0
- deepeval/optimizer/configs.py +31 -0
- deepeval/optimizer/policies.py +227 -0
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/optimizer/utils.py +480 -0
- deepeval/prompt/prompt.py +7 -6
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/cache.py +2 -0
- deepeval/test_run/test_run.py +9 -4
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +89 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -334
- deepeval/models/mlllms/gemini_model.py +0 -284
- deepeval/models/mlllms/ollama_model.py +0 -144
- deepeval/models/mlllms/openai_model.py +0 -258
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/test_case/api.py
CHANGED
|
@@ -10,9 +10,9 @@ from deepeval.test_run.api import (
|
|
|
10
10
|
from deepeval.test_case import (
|
|
11
11
|
LLMTestCase,
|
|
12
12
|
ConversationalTestCase,
|
|
13
|
-
MLLMTestCase,
|
|
14
13
|
Turn,
|
|
15
14
|
)
|
|
15
|
+
from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY
|
|
16
16
|
from deepeval.constants import PYTEST_RUN_TEST_NAME
|
|
17
17
|
|
|
18
18
|
|
|
@@ -29,10 +29,12 @@ def create_api_turn(turn: Turn, index: int) -> TurnApi:
|
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
def create_api_test_case(
|
|
32
|
-
test_case: Union[LLMTestCase, ConversationalTestCase
|
|
32
|
+
test_case: Union[LLMTestCase, ConversationalTestCase],
|
|
33
33
|
trace: Optional[TraceApi] = None,
|
|
34
34
|
index: Optional[int] = None,
|
|
35
35
|
) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
|
|
36
|
+
from deepeval.utils import convert_to_multi_modal_array
|
|
37
|
+
|
|
36
38
|
if isinstance(test_case, ConversationalTestCase):
|
|
37
39
|
order = (
|
|
38
40
|
test_case._dataset_rank
|
|
@@ -84,7 +86,7 @@ def create_api_test_case(
|
|
|
84
86
|
name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
|
|
85
87
|
metrics_data = []
|
|
86
88
|
|
|
87
|
-
if isinstance(test_case, LLMTestCase):
|
|
89
|
+
if isinstance(test_case, LLMTestCase) and test_case.multimodal is False:
|
|
88
90
|
api_test_case = LLMApiTestCase(
|
|
89
91
|
name=name,
|
|
90
92
|
input=test_case.input,
|
|
@@ -106,15 +108,15 @@ def create_api_test_case(
|
|
|
106
108
|
comments=test_case.comments,
|
|
107
109
|
trace=trace,
|
|
108
110
|
)
|
|
109
|
-
elif isinstance(test_case,
|
|
111
|
+
elif isinstance(test_case, LLMTestCase) and test_case.multimodal:
|
|
110
112
|
api_test_case = LLMApiTestCase(
|
|
111
113
|
name=name,
|
|
112
|
-
input=
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
114
|
+
input=test_case.input,
|
|
115
|
+
actualOutput=test_case.actual_output,
|
|
116
|
+
expectedOutput=test_case.expected_output,
|
|
117
|
+
retrievalContext=test_case.retrieval_context,
|
|
118
|
+
context=test_case.context,
|
|
119
|
+
imagesMapping=_MLLM_IMAGE_REGISTRY,
|
|
118
120
|
toolsCalled=test_case.tools_called,
|
|
119
121
|
expectedTools=test_case.expected_tools,
|
|
120
122
|
tokenCost=test_case.token_cost,
|
|
@@ -9,7 +9,7 @@ from typing import List, Optional, Dict, Literal
|
|
|
9
9
|
from copy import deepcopy
|
|
10
10
|
from enum import Enum
|
|
11
11
|
|
|
12
|
-
from deepeval.test_case import ToolCall
|
|
12
|
+
from deepeval.test_case import ToolCall, MLLMImage
|
|
13
13
|
from deepeval.test_case.mcp import (
|
|
14
14
|
MCPServer,
|
|
15
15
|
MCPPromptCall,
|
|
@@ -156,11 +156,29 @@ class ConversationalTestCase(BaseModel):
|
|
|
156
156
|
comments: Optional[str] = Field(default=None)
|
|
157
157
|
tags: Optional[List[str]] = Field(default=None)
|
|
158
158
|
mcp_servers: Optional[List[MCPServer]] = Field(default=None)
|
|
159
|
+
multimodal: bool = False
|
|
159
160
|
|
|
160
161
|
_dataset_rank: Optional[int] = PrivateAttr(default=None)
|
|
161
162
|
_dataset_alias: Optional[str] = PrivateAttr(default=None)
|
|
162
163
|
_dataset_id: Optional[str] = PrivateAttr(default=None)
|
|
163
164
|
|
|
165
|
+
@model_validator(mode="after")
|
|
166
|
+
def set_is_multimodal(self):
|
|
167
|
+
import re
|
|
168
|
+
|
|
169
|
+
if self.multimodal is True:
|
|
170
|
+
return self
|
|
171
|
+
|
|
172
|
+
pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
|
|
173
|
+
self.multimodal = any(
|
|
174
|
+
[
|
|
175
|
+
re.search(pattern, turn.content) is not None
|
|
176
|
+
for turn in self.turns
|
|
177
|
+
]
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
return self
|
|
181
|
+
|
|
164
182
|
@model_validator(mode="before")
|
|
165
183
|
def validate_input(cls, data):
|
|
166
184
|
turns = data.get("turns")
|
|
@@ -9,7 +9,12 @@ from typing import List, Optional, Dict, Any
|
|
|
9
9
|
from enum import Enum
|
|
10
10
|
import json
|
|
11
11
|
import uuid
|
|
12
|
-
|
|
12
|
+
import re
|
|
13
|
+
import os
|
|
14
|
+
import mimetypes
|
|
15
|
+
import base64
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from urllib.parse import urlparse, unquote
|
|
13
18
|
from deepeval.utils import make_model_config
|
|
14
19
|
|
|
15
20
|
from deepeval.test_case.mcp import (
|
|
@@ -20,6 +25,128 @@ from deepeval.test_case.mcp import (
|
|
|
20
25
|
validate_mcp_servers,
|
|
21
26
|
)
|
|
22
27
|
|
|
28
|
+
_MLLM_IMAGE_REGISTRY: Dict[str, "MLLMImage"] = {}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
|
|
32
|
+
class MLLMImage:
|
|
33
|
+
dataBase64: Optional[str] = None
|
|
34
|
+
mimeType: Optional[str] = None
|
|
35
|
+
url: Optional[str] = None
|
|
36
|
+
local: Optional[bool] = None
|
|
37
|
+
filename: Optional[str] = None
|
|
38
|
+
_id: str = field(default_factory=lambda: uuid.uuid4().hex)
|
|
39
|
+
|
|
40
|
+
def __post_init__(self):
|
|
41
|
+
|
|
42
|
+
if not self.url and not self.dataBase64:
|
|
43
|
+
raise ValueError(
|
|
44
|
+
"You must provide either a 'url' or both 'dataBase64' and 'mimeType' to create an MLLMImage."
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
if self.dataBase64 is not None:
|
|
48
|
+
if self.mimeType is None:
|
|
49
|
+
raise ValueError(
|
|
50
|
+
"mimeType must be provided when initializing from Base64 data."
|
|
51
|
+
)
|
|
52
|
+
else:
|
|
53
|
+
is_local = self.is_local_path(self.url)
|
|
54
|
+
if self.local is not None:
|
|
55
|
+
assert self.local == is_local, "Local path mismatch"
|
|
56
|
+
else:
|
|
57
|
+
self.local = is_local
|
|
58
|
+
|
|
59
|
+
# compute filename, mime_type, and Base64 data
|
|
60
|
+
if self.local:
|
|
61
|
+
path = self.process_url(self.url)
|
|
62
|
+
self.filename = os.path.basename(path)
|
|
63
|
+
self.mimeType = (
|
|
64
|
+
mimetypes.guess_type(path)[0] or "application/octet-stream"
|
|
65
|
+
)
|
|
66
|
+
with open(path, "rb") as f:
|
|
67
|
+
raw = f.read()
|
|
68
|
+
self.dataBase64 = base64.b64encode(raw).decode("ascii")
|
|
69
|
+
else:
|
|
70
|
+
self.filename = None
|
|
71
|
+
self.mimeType = None
|
|
72
|
+
self.dataBase64 = None
|
|
73
|
+
|
|
74
|
+
_MLLM_IMAGE_REGISTRY[self._id] = self
|
|
75
|
+
|
|
76
|
+
def _placeholder(self) -> str:
|
|
77
|
+
return f"[DEEPEVAL:IMAGE:{self._id}]"
|
|
78
|
+
|
|
79
|
+
def __str__(self) -> str:
|
|
80
|
+
return self._placeholder()
|
|
81
|
+
|
|
82
|
+
def __repr__(self) -> str:
|
|
83
|
+
return self._placeholder()
|
|
84
|
+
|
|
85
|
+
def __format__(self, format_spec: str) -> str:
|
|
86
|
+
return self._placeholder()
|
|
87
|
+
|
|
88
|
+
@staticmethod
|
|
89
|
+
def process_url(url: str) -> str:
|
|
90
|
+
if os.path.exists(url):
|
|
91
|
+
return url
|
|
92
|
+
parsed = urlparse(url)
|
|
93
|
+
if parsed.scheme == "file":
|
|
94
|
+
raw_path = (
|
|
95
|
+
f"//{parsed.netloc}{parsed.path}"
|
|
96
|
+
if parsed.netloc
|
|
97
|
+
else parsed.path
|
|
98
|
+
)
|
|
99
|
+
path = unquote(raw_path)
|
|
100
|
+
return path
|
|
101
|
+
return url
|
|
102
|
+
|
|
103
|
+
@staticmethod
|
|
104
|
+
def is_local_path(url: str) -> bool:
|
|
105
|
+
if os.path.exists(url):
|
|
106
|
+
return True
|
|
107
|
+
parsed = urlparse(url)
|
|
108
|
+
if parsed.scheme == "file":
|
|
109
|
+
raw_path = (
|
|
110
|
+
f"//{parsed.netloc}{parsed.path}"
|
|
111
|
+
if parsed.netloc
|
|
112
|
+
else parsed.path
|
|
113
|
+
)
|
|
114
|
+
path = unquote(raw_path)
|
|
115
|
+
return os.path.exists(path)
|
|
116
|
+
return False
|
|
117
|
+
|
|
118
|
+
def parse_multimodal_string(s: str):
|
|
119
|
+
pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
|
|
120
|
+
matches = list(re.finditer(pattern, s))
|
|
121
|
+
|
|
122
|
+
result = []
|
|
123
|
+
last_end = 0
|
|
124
|
+
|
|
125
|
+
for m in matches:
|
|
126
|
+
start, end = m.span()
|
|
127
|
+
|
|
128
|
+
if start > last_end:
|
|
129
|
+
result.append(s[last_end:start])
|
|
130
|
+
|
|
131
|
+
img_id = m.group(1)
|
|
132
|
+
|
|
133
|
+
if img_id not in _MLLM_IMAGE_REGISTRY:
|
|
134
|
+
MLLMImage(url=img_id, _id=img_id)
|
|
135
|
+
|
|
136
|
+
result.append(_MLLM_IMAGE_REGISTRY[img_id])
|
|
137
|
+
last_end = end
|
|
138
|
+
|
|
139
|
+
if last_end < len(s):
|
|
140
|
+
result.append(s[last_end:])
|
|
141
|
+
|
|
142
|
+
return result
|
|
143
|
+
|
|
144
|
+
def as_data_uri(self) -> Optional[str]:
|
|
145
|
+
"""Return the image as a data URI string, if Base64 data is available."""
|
|
146
|
+
if not self.dataBase64 or not self.mimeType:
|
|
147
|
+
return None
|
|
148
|
+
return f"data:{self.mimeType};base64,{self.dataBase64}"
|
|
149
|
+
|
|
23
150
|
|
|
24
151
|
class LLMTestCaseParams(Enum):
|
|
25
152
|
INPUT = "input"
|
|
@@ -208,6 +335,7 @@ class LLMTestCase(BaseModel):
|
|
|
208
335
|
serialization_alias="completionTime",
|
|
209
336
|
validation_alias=AliasChoices("completionTime", "completion_time"),
|
|
210
337
|
)
|
|
338
|
+
multimodal: bool = Field(default=False)
|
|
211
339
|
name: Optional[str] = Field(default=None)
|
|
212
340
|
tags: Optional[List[str]] = Field(default=None)
|
|
213
341
|
mcp_servers: Optional[List[MCPServer]] = Field(default=None)
|
|
@@ -229,6 +357,29 @@ class LLMTestCase(BaseModel):
|
|
|
229
357
|
default_factory=lambda: str(uuid.uuid4())
|
|
230
358
|
)
|
|
231
359
|
|
|
360
|
+
@model_validator(mode="after")
|
|
361
|
+
def set_is_multimodal(self):
|
|
362
|
+
import re
|
|
363
|
+
|
|
364
|
+
if self.multimodal is True:
|
|
365
|
+
return self
|
|
366
|
+
|
|
367
|
+
pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
|
|
368
|
+
|
|
369
|
+
auto_detect = (
|
|
370
|
+
any(
|
|
371
|
+
[
|
|
372
|
+
re.search(pattern, self.input or "") is not None,
|
|
373
|
+
re.search(pattern, self.actual_output or "") is not None,
|
|
374
|
+
]
|
|
375
|
+
)
|
|
376
|
+
if isinstance(self.input, str)
|
|
377
|
+
else self.multimodal
|
|
378
|
+
)
|
|
379
|
+
|
|
380
|
+
self.multimodal = auto_detect
|
|
381
|
+
return self
|
|
382
|
+
|
|
232
383
|
@model_validator(mode="before")
|
|
233
384
|
def validate_input(cls, data):
|
|
234
385
|
input = data.get("input")
|
deepeval/test_case/utils.py
CHANGED
|
@@ -1,24 +1,20 @@
|
|
|
1
1
|
from typing import Union, List
|
|
2
2
|
|
|
3
|
-
from deepeval.test_case import LLMTestCase,
|
|
3
|
+
from deepeval.test_case import LLMTestCase, ConversationalTestCase
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def check_valid_test_cases_type(
|
|
7
|
-
test_cases: Union[
|
|
8
|
-
List[Union[LLMTestCase, MLLMTestCase]], List[ConversationalTestCase]
|
|
9
|
-
],
|
|
7
|
+
test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
|
|
10
8
|
):
|
|
11
9
|
llm_test_case_count = 0
|
|
12
10
|
conversational_test_case_count = 0
|
|
13
11
|
for test_case in test_cases:
|
|
14
|
-
if isinstance(test_case, LLMTestCase)
|
|
15
|
-
test_case, MLLMTestCase
|
|
16
|
-
):
|
|
12
|
+
if isinstance(test_case, LLMTestCase):
|
|
17
13
|
llm_test_case_count += 1
|
|
18
14
|
else:
|
|
19
15
|
conversational_test_case_count += 1
|
|
20
16
|
|
|
21
17
|
if llm_test_case_count > 0 and conversational_test_case_count > 0:
|
|
22
18
|
raise ValueError(
|
|
23
|
-
"You cannot supply a mixture of `LLMTestCase
|
|
19
|
+
"You cannot supply a mixture of `LLMTestCase`(s) and `ConversationalTestCase`(s) as the list of test cases."
|
|
24
20
|
)
|
deepeval/test_run/api.py
CHANGED
|
@@ -18,20 +18,21 @@ class LLMApiTestCase(BaseModel):
|
|
|
18
18
|
token_cost: Optional[float] = Field(None, alias="tokenCost")
|
|
19
19
|
completion_time: Optional[float] = Field(None, alias="completionTime")
|
|
20
20
|
tags: Optional[List[str]] = Field(None)
|
|
21
|
-
multimodal_input: Optional[
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
21
|
+
# multimodal_input: Optional[str] = Field(None, alias="multimodalInput")
|
|
22
|
+
# multimodal_input_actual_output: Optional[str] = Field(
|
|
23
|
+
# None, alias="multimodalActualOutput"
|
|
24
|
+
# )
|
|
25
|
+
# multimodal_expected_output: Optional[str] = Field(
|
|
26
|
+
# None, alias="multimodalExpectedOutput"
|
|
27
|
+
# )
|
|
28
|
+
# multimodal_retrieval_context: Optional[List[str]] = Field(
|
|
29
|
+
# None, alias="multimodalRetrievalContext"
|
|
30
|
+
# )
|
|
31
|
+
# multimodal_context: Optional[List[str]] = Field(
|
|
32
|
+
# None, alias="multimodalContext"
|
|
33
|
+
# )
|
|
34
|
+
images_mapping: Optional[Dict[str, MLLMImage]] = Field(
|
|
35
|
+
None, alias="imagesMapping"
|
|
35
36
|
)
|
|
36
37
|
|
|
37
38
|
# make these optional, not all test cases in a conversation will be evaluated
|
deepeval/test_run/cache.py
CHANGED
|
@@ -90,6 +90,8 @@ class CachedTestRun(BaseModel):
|
|
|
90
90
|
# Pydantic version below 2.0
|
|
91
91
|
body = self.dict(by_alias=True, exclude_none=True)
|
|
92
92
|
json.dump(body, f, cls=CustomEncoder)
|
|
93
|
+
f.flush()
|
|
94
|
+
os.fsync(f.fileno())
|
|
93
95
|
return self
|
|
94
96
|
|
|
95
97
|
# load from file (this happens initially during a test run)
|
deepeval/test_run/test_run.py
CHANGED
|
@@ -21,7 +21,7 @@ from deepeval.test_run.api import (
|
|
|
21
21
|
)
|
|
22
22
|
from deepeval.tracing.utils import make_json_serializable
|
|
23
23
|
from deepeval.tracing.api import SpanApiType, span_api_type_literals
|
|
24
|
-
from deepeval.test_case import LLMTestCase, ConversationalTestCase
|
|
24
|
+
from deepeval.test_case import LLMTestCase, ConversationalTestCase
|
|
25
25
|
from deepeval.utils import (
|
|
26
26
|
delete_file_if_exists,
|
|
27
27
|
get_is_running_deepeval,
|
|
@@ -182,7 +182,7 @@ class TestRun(BaseModel):
|
|
|
182
182
|
|
|
183
183
|
def set_dataset_properties(
|
|
184
184
|
self,
|
|
185
|
-
test_case: Union[LLMTestCase, ConversationalTestCase
|
|
185
|
+
test_case: Union[LLMTestCase, ConversationalTestCase],
|
|
186
186
|
):
|
|
187
187
|
if self.dataset_alias is None:
|
|
188
188
|
self.dataset_alias = test_case._dataset_alias
|
|
@@ -406,9 +406,10 @@ class TestRun(BaseModel):
|
|
|
406
406
|
try:
|
|
407
407
|
body = self.model_dump(by_alias=True, exclude_none=True)
|
|
408
408
|
except AttributeError:
|
|
409
|
-
# Pydantic version below 2.0
|
|
410
409
|
body = self.dict(by_alias=True, exclude_none=True)
|
|
411
410
|
json.dump(body, f, cls=TestRunEncoder)
|
|
411
|
+
f.flush()
|
|
412
|
+
os.fsync(f.fileno())
|
|
412
413
|
return self
|
|
413
414
|
|
|
414
415
|
@classmethod
|
|
@@ -515,6 +516,8 @@ class TestRunManager:
|
|
|
515
516
|
)
|
|
516
517
|
wrapper_data = {save_under_key: test_run_data}
|
|
517
518
|
json.dump(wrapper_data, file, cls=TestRunEncoder)
|
|
519
|
+
file.flush()
|
|
520
|
+
os.fsync(file.fileno())
|
|
518
521
|
else:
|
|
519
522
|
self.test_run.save(file)
|
|
520
523
|
except portalocker.exceptions.LockException:
|
|
@@ -527,13 +530,15 @@ class TestRunManager:
|
|
|
527
530
|
LATEST_TEST_RUN_FILE_PATH, mode="w"
|
|
528
531
|
) as file:
|
|
529
532
|
json.dump({LATEST_TEST_RUN_LINK_KEY: link}, file)
|
|
533
|
+
file.flush()
|
|
534
|
+
os.fsync(file.fileno())
|
|
530
535
|
except portalocker.exceptions.LockException:
|
|
531
536
|
pass
|
|
532
537
|
|
|
533
538
|
def update_test_run(
|
|
534
539
|
self,
|
|
535
540
|
api_test_case: Union[LLMApiTestCase, ConversationalApiTestCase],
|
|
536
|
-
test_case: Union[LLMTestCase, ConversationalTestCase
|
|
541
|
+
test_case: Union[LLMTestCase, ConversationalTestCase],
|
|
537
542
|
):
|
|
538
543
|
if (
|
|
539
544
|
api_test_case.metrics_data is not None
|
deepeval/tracing/patchers.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import functools
|
|
2
2
|
|
|
3
|
-
from
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
4
5
|
from openai import OpenAI
|
|
5
6
|
|
|
6
7
|
from deepeval.tracing.context import update_current_span, update_llm_span
|
|
@@ -8,6 +9,10 @@ from deepeval.tracing.context import current_span_context
|
|
|
8
9
|
from deepeval.tracing.types import LlmSpan
|
|
9
10
|
|
|
10
11
|
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from anthropic import Anthropic
|
|
14
|
+
|
|
15
|
+
|
|
11
16
|
def patch_openai_client(client: OpenAI):
|
|
12
17
|
|
|
13
18
|
original_methods = {}
|
|
@@ -61,7 +66,7 @@ def patch_openai_client(client: OpenAI):
|
|
|
61
66
|
output = None
|
|
62
67
|
try:
|
|
63
68
|
output = response.choices[0].message.content
|
|
64
|
-
except Exception
|
|
69
|
+
except Exception:
|
|
65
70
|
pass
|
|
66
71
|
|
|
67
72
|
# extract input output token counts
|
|
@@ -70,7 +75,7 @@ def patch_openai_client(client: OpenAI):
|
|
|
70
75
|
try:
|
|
71
76
|
input_token_count = response.usage.prompt_tokens
|
|
72
77
|
output_token_count = response.usage.completion_tokens
|
|
73
|
-
except Exception
|
|
78
|
+
except Exception:
|
|
74
79
|
pass
|
|
75
80
|
|
|
76
81
|
update_current_span(
|
|
@@ -86,7 +91,7 @@ def patch_openai_client(client: OpenAI):
|
|
|
86
91
|
setattr(current_obj, method_name, wrapped_method)
|
|
87
92
|
|
|
88
93
|
|
|
89
|
-
def patch_anthropic_client(client: Anthropic):
|
|
94
|
+
def patch_anthropic_client(client: "Anthropic"):
|
|
90
95
|
"""
|
|
91
96
|
Patch an Anthropic client instance to add tracing capabilities.
|
|
92
97
|
|
deepeval/tracing/tracing.py
CHANGED
|
@@ -19,7 +19,6 @@ import random
|
|
|
19
19
|
import atexit
|
|
20
20
|
import queue
|
|
21
21
|
import uuid
|
|
22
|
-
from anthropic import Anthropic
|
|
23
22
|
from openai import OpenAI
|
|
24
23
|
from rich.console import Console
|
|
25
24
|
from rich.progress import Progress
|
|
@@ -74,6 +73,7 @@ from deepeval.tracing.trace_test_manager import trace_testing_manager
|
|
|
74
73
|
|
|
75
74
|
if TYPE_CHECKING:
|
|
76
75
|
from deepeval.dataset.golden import Golden
|
|
76
|
+
from anthropic import Anthropic
|
|
77
77
|
|
|
78
78
|
EVAL_DUMMY_SPAN_NAME = "evals_iterator"
|
|
79
79
|
|
|
@@ -154,7 +154,7 @@ class TraceManager:
|
|
|
154
154
|
environment: Optional[str] = None,
|
|
155
155
|
sampling_rate: Optional[float] = None,
|
|
156
156
|
confident_api_key: Optional[str] = None,
|
|
157
|
-
anthropic_client: Optional[Anthropic] = None,
|
|
157
|
+
anthropic_client: Optional["Anthropic"] = None,
|
|
158
158
|
openai_client: Optional[OpenAI] = None,
|
|
159
159
|
tracing_enabled: Optional[bool] = None,
|
|
160
160
|
) -> None:
|
deepeval/utils.py
CHANGED
|
@@ -14,6 +14,7 @@ import logging
|
|
|
14
14
|
|
|
15
15
|
from contextvars import ContextVar
|
|
16
16
|
from enum import Enum
|
|
17
|
+
from importlib import import_module
|
|
17
18
|
from typing import Any, Dict, List, Optional, Protocol, Sequence, Union
|
|
18
19
|
from collections.abc import Iterable
|
|
19
20
|
from dataclasses import asdict, is_dataclass
|
|
@@ -21,6 +22,7 @@ from pydantic import BaseModel
|
|
|
21
22
|
from rich.progress import Progress
|
|
22
23
|
from rich.console import Console, Theme
|
|
23
24
|
|
|
25
|
+
from deepeval.errors import DeepEvalError
|
|
24
26
|
from deepeval.config.settings import get_settings
|
|
25
27
|
from deepeval.config.utils import (
|
|
26
28
|
get_env_bool,
|
|
@@ -536,6 +538,25 @@ def shorten(
|
|
|
536
538
|
return stext[:cut] + suffix
|
|
537
539
|
|
|
538
540
|
|
|
541
|
+
def convert_to_multi_modal_array(input: Union[str, List[str]]):
|
|
542
|
+
from deepeval.test_case import MLLMImage
|
|
543
|
+
|
|
544
|
+
if isinstance(input, str):
|
|
545
|
+
return MLLMImage.parse_multimodal_string(input)
|
|
546
|
+
elif isinstance(input, list):
|
|
547
|
+
new_list = []
|
|
548
|
+
for context in input:
|
|
549
|
+
parsed_array = MLLMImage.parse_multimodal_string(context)
|
|
550
|
+
new_list.extend(parsed_array)
|
|
551
|
+
return new_list
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
def check_if_multimodal(input: str):
|
|
555
|
+
pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
|
|
556
|
+
matches = list(re.finditer(pattern, input))
|
|
557
|
+
return bool(matches)
|
|
558
|
+
|
|
559
|
+
|
|
539
560
|
def format_turn(
|
|
540
561
|
turn: TurnLike,
|
|
541
562
|
*,
|
|
@@ -814,3 +835,71 @@ def format_error_text(
|
|
|
814
835
|
|
|
815
836
|
def is_read_only_env():
|
|
816
837
|
return get_settings().DEEPEVAL_FILE_SYSTEM == "READ_ONLY"
|
|
838
|
+
|
|
839
|
+
|
|
840
|
+
##############
|
|
841
|
+
# validation #
|
|
842
|
+
##############
|
|
843
|
+
|
|
844
|
+
|
|
845
|
+
def require_param(
|
|
846
|
+
param: Optional[Any] = None,
|
|
847
|
+
*,
|
|
848
|
+
provider_label: str,
|
|
849
|
+
env_var_name: str,
|
|
850
|
+
param_hint: str,
|
|
851
|
+
) -> Any:
|
|
852
|
+
"""
|
|
853
|
+
Ensures that a required parameter is provided. If the parameter is `None`, raises a
|
|
854
|
+
`DeepEvalError` with a helpful message indicating the missing parameter and how to resolve it.
|
|
855
|
+
|
|
856
|
+
Args:
|
|
857
|
+
param (Optional[Any]): The parameter to validate.
|
|
858
|
+
provider_label (str): A label for the provider to be used in the error message.
|
|
859
|
+
env_var_name (str): The name of the environment variable where the parameter can be set.
|
|
860
|
+
param_hint (str): A hint for the parameter, usually the name of the argument.
|
|
861
|
+
|
|
862
|
+
Raises:
|
|
863
|
+
DeepEvalError: If the `param` is `None`, indicating that a required parameter is missing.
|
|
864
|
+
|
|
865
|
+
Returns:
|
|
866
|
+
Any: The value of `param` if it is provided.
|
|
867
|
+
"""
|
|
868
|
+
if param is None:
|
|
869
|
+
raise DeepEvalError(
|
|
870
|
+
f"{provider_label} is missing a required parameter. "
|
|
871
|
+
f"Set {env_var_name} in your environment or pass "
|
|
872
|
+
f"{param_hint}."
|
|
873
|
+
)
|
|
874
|
+
|
|
875
|
+
return param
|
|
876
|
+
|
|
877
|
+
|
|
878
|
+
def require_dependency(
|
|
879
|
+
module_name: str,
|
|
880
|
+
*,
|
|
881
|
+
provider_label: str,
|
|
882
|
+
install_hint: Optional[str] = None,
|
|
883
|
+
) -> Any:
|
|
884
|
+
"""
|
|
885
|
+
Imports an optional dependency module or raises a `DeepEvalError` if the module is not found.
|
|
886
|
+
The error message includes a suggestion on how to install the missing module.
|
|
887
|
+
|
|
888
|
+
Args:
|
|
889
|
+
module_name (str): The name of the module to import.
|
|
890
|
+
provider_label (str): A label for the provider to be used in the error message.
|
|
891
|
+
install_hint (Optional[str]): A hint on how to install the missing module, usually a pip command.
|
|
892
|
+
|
|
893
|
+
Raises:
|
|
894
|
+
DeepEvalError: If the module cannot be imported, indicating that the dependency is missing.
|
|
895
|
+
|
|
896
|
+
Returns:
|
|
897
|
+
Any: The imported module if successful.
|
|
898
|
+
"""
|
|
899
|
+
try:
|
|
900
|
+
return import_module(module_name)
|
|
901
|
+
except ImportError as exc:
|
|
902
|
+
hint = install_hint or f"Install it with `pip install {module_name}`."
|
|
903
|
+
raise DeepEvalError(
|
|
904
|
+
f"{provider_label} requires the `{module_name}` package. {hint}"
|
|
905
|
+
) from exc
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: deepeval
|
|
3
|
-
Version: 3.7.
|
|
3
|
+
Version: 3.7.5
|
|
4
4
|
Summary: The LLM Evaluation Framework
|
|
5
5
|
Home-page: https://github.com/confident-ai/deepeval
|
|
6
6
|
License: Apache-2.0
|
|
@@ -13,13 +13,10 @@ Classifier: Programming Language :: Python :: 3.9
|
|
|
13
13
|
Classifier: Programming Language :: Python :: 3.10
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.11
|
|
15
15
|
Requires-Dist: aiohttp
|
|
16
|
-
Requires-Dist: anthropic
|
|
17
16
|
Requires-Dist: click (>=8.0.0,<8.3.0)
|
|
18
|
-
Requires-Dist: google-genai (>=1.9.0,<2.0.0)
|
|
19
17
|
Requires-Dist: grpcio (>=1.67.1,<2.0.0)
|
|
20
18
|
Requires-Dist: jinja2
|
|
21
19
|
Requires-Dist: nest_asyncio
|
|
22
|
-
Requires-Dist: ollama
|
|
23
20
|
Requires-Dist: openai
|
|
24
21
|
Requires-Dist: opentelemetry-api (>=1.24.0,<2.0.0)
|
|
25
22
|
Requires-Dist: opentelemetry-exporter-otlp-proto-grpc (>=1.24.0,<2.0.0)
|