deepeval 3.6.9__py3-none-any.whl → 3.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/anthropic/__init__.py +19 -0
- deepeval/anthropic/extractors.py +94 -0
- deepeval/anthropic/patch.py +169 -0
- deepeval/anthropic/utils.py +225 -0
- deepeval/benchmarks/drop/drop.py +40 -14
- deepeval/benchmarks/ifeval/ifeval.py +2 -2
- deepeval/confident/types.py +4 -2
- deepeval/config/settings.py +154 -11
- deepeval/config/settings_manager.py +4 -0
- deepeval/integrations/crewai/handler.py +36 -0
- deepeval/integrations/langchain/callback.py +27 -2
- deepeval/integrations/llama_index/handler.py +58 -4
- deepeval/integrations/llama_index/utils.py +24 -0
- deepeval/metrics/__init__.py +5 -0
- deepeval/metrics/exact_match/__init__.py +0 -0
- deepeval/metrics/exact_match/exact_match.py +94 -0
- deepeval/metrics/pattern_match/__init__.py +0 -0
- deepeval/metrics/pattern_match/pattern_match.py +103 -0
- deepeval/metrics/task_completion/task_completion.py +9 -2
- deepeval/model_integrations/__init__.py +0 -0
- deepeval/model_integrations/utils.py +116 -0
- deepeval/models/base_model.py +3 -1
- deepeval/openai/__init__.py +3 -1
- deepeval/openai/extractors.py +2 -2
- deepeval/openai/utils.py +7 -31
- deepeval/prompt/api.py +11 -10
- deepeval/prompt/prompt.py +5 -4
- deepeval/telemetry.py +3 -3
- deepeval/test_case/llm_test_case.py +3 -2
- deepeval/test_run/api.py +3 -2
- deepeval/test_run/cache.py +4 -3
- deepeval/test_run/test_run.py +24 -5
- deepeval/tracing/api.py +11 -10
- deepeval/tracing/otel/exporter.py +11 -0
- deepeval/tracing/patchers.py +102 -1
- deepeval/tracing/trace_context.py +13 -4
- deepeval/tracing/tracing.py +10 -1
- deepeval/tracing/types.py +8 -8
- deepeval/tracing/utils.py +9 -0
- deepeval/utils.py +44 -2
- {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/METADATA +2 -2
- {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/RECORD +47 -37
- /deepeval/{openai → model_integrations}/types.py +0 -0
- {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/WHEEL +0 -0
- {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/entry_points.txt +0 -0
deepeval/metrics/pattern_match/pattern_match.py
ADDED

@@ -0,0 +1,103 @@
+import re
+from typing import List
+
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.utils import (
+    check_llm_test_case_params,
+    construct_verbose_logs,
+)
+from deepeval.metrics.api import metric_data_manager
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+
+
+class PatternMatchMetric(BaseMetric):
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
+    ]
+
+    def __init__(
+        self,
+        pattern: str,
+        ignore_case: bool = False,
+        threshold: float = 1.0,
+        verbose_mode: bool = False,
+    ):
+        self.pattern = pattern.strip()
+        self.ignore_case = ignore_case
+        self.verbose_mode = verbose_mode
+        self.threshold = threshold
+
+        flags = re.IGNORECASE if ignore_case else 0
+        try:
+            self._compiled_pattern = re.compile(self.pattern, flags)
+        except re.error as e:
+            raise ValueError(f"Invalid regex pattern: {pattern} — {e}")
+
+    def measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_llm_test_case_params(test_case, self._required_params, self)
+
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            actual = test_case.actual_output.strip()
+            full_match = self._compiled_pattern.fullmatch(actual)
+
+            self.score = 1.0 if full_match else 0.0
+            self.reason = (
+                f"The actual output fully matches the pattern."
+                if full_match
+                else f"The actual output does not match the pattern."
+            )
+            self.success = self.score >= self.threshold
+
+            if self.verbose_mode:
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Pattern: {self.pattern}",
+                        f"Actual: {actual}",
+                        f"Score: {self.score:.2f}",
+                        f"Reason: {self.reason}",
+                    ],
+                )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+    ) -> float:
+        return self.measure(
+            test_case,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        )
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Pattern Match"
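The new PatternMatchMetric is deterministic and LLM-free: it compiles the regex once in __init__ and scores 1.0 only when fullmatch succeeds against the whole (stripped) actual_output, so partial hits score 0.0. A minimal usage sketch based only on the signatures above; it assumes the five lines added to deepeval/metrics/__init__.py re-export the new metric (otherwise import it from deepeval.metrics.pattern_match.pattern_match):

    from deepeval.metrics import PatternMatchMetric
    from deepeval.test_case import LLMTestCase

    # Require the model to answer with an ISO date and nothing else.
    metric = PatternMatchMetric(pattern=r"\d{4}-\d{2}-\d{2}")

    test_case = LLMTestCase(
        input="When was version 3.7.0 released?",
        actual_output="2024-11-05",
    )

    metric.measure(test_case)   # 1.0: the entire output matches the pattern
    metric.is_successful()      # True, since score >= threshold (default 1.0)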
deepeval/metrics/task_completion/task_completion.py
CHANGED

@@ -36,6 +36,11 @@ class TaskCompletionMetric(BaseMetric):
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
+        if task is None:
+            self._is_task_provided = False
+        else:
+            self._is_task_provided = True
+
         self.task = task
         self.threshold = 1 if strict_mode else threshold
         self.model, self.using_native_model = initialize_model(model)
@@ -73,7 +78,8 @@ class TaskCompletionMetric(BaseMetric):
                 )
             else:
                 task, self.outcome = self._extract_task_and_outcome(test_case)
-
+                if self.task is None or not self._is_task_provided:
+                    self.task = task
             self.verdict, self.reason = self._generate_verdicts()
             self.score = self._calculate_score()
             self.success = self.score >= self.threshold
@@ -108,7 +114,8 @@ class TaskCompletionMetric(BaseMetric):
                 task, self.outcome = await self._a_extract_task_and_outcome(
                     test_case
                 )
-
+                if self.task is None or not self._is_task_provided:
+                    self.task = task
             self.verdict, self.reason = await self._a_generate_verdicts()
             self.score = self._calculate_score()
             self.success = self.score >= self.threshold
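The TaskCompletionMetric change records at construction time whether a task was supplied (_is_task_provided) and, in both measure and a_measure, falls back to the task extracted from the test case when none was given. A hedged sketch of the resulting behavior; it assumes task defaults to None, which the `if task is None` branch implies:

    from deepeval.metrics import TaskCompletionMetric

    # Explicit task: kept as-is, even though extraction still runs.
    pinned = TaskCompletionMetric(task="Book a table for two at 7pm")

    # No task: the metric adopts whatever _extract_task_and_outcome()
    # infers from the trace (_is_task_provided is False).
    inferred = TaskCompletionMetric()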
deepeval/model_integrations/__init__.py
ADDED
File without changes

deepeval/model_integrations/utils.py
ADDED

@@ -0,0 +1,116 @@
+import json
+import uuid
+from typing import Any, List, Optional
+
+from deepeval.model_integrations.types import InputParameters, OutputParameters
+from deepeval.test_case.llm_test_case import ToolCall
+from deepeval.tracing.context import (
+    current_span_context,
+    current_trace_context,
+    update_current_span,
+    update_llm_span,
+)
+from deepeval.tracing.trace_context import current_llm_context
+from deepeval.tracing.types import ToolSpan, TraceSpanStatus
+from deepeval.utils import shorten, len_long
+
+
+def _update_all_attributes(
+    input_parameters: InputParameters,
+    output_parameters: OutputParameters,
+    expected_tools: List[ToolCall],
+    expected_output: str,
+    context: List[str],
+    retrieval_context: List[str],
+):
+    """Update span and trace attributes with input/output parameters."""
+    update_current_span(
+        input=input_parameters.input or input_parameters.messages or "NA",
+        output=output_parameters.output or "NA",
+        tools_called=output_parameters.tools_called,
+        # attributes to be added
+        expected_output=expected_output,
+        expected_tools=expected_tools,
+        context=context,
+        retrieval_context=retrieval_context,
+    )
+
+    llm_context = current_llm_context.get()
+
+    update_llm_span(
+        input_token_count=output_parameters.prompt_tokens,
+        output_token_count=output_parameters.completion_tokens,
+        prompt=llm_context.prompt,
+    )
+
+    if output_parameters.tools_called:
+        create_child_tool_spans(output_parameters)
+
+    __update_input_and_output_of_current_trace(
+        input_parameters, output_parameters
+    )
+
+
+def __update_input_and_output_of_current_trace(
+    input_parameters: InputParameters, output_parameters: OutputParameters
+):
+
+    current_trace = current_trace_context.get()
+    if current_trace:
+        if current_trace.input is None:
+            current_trace.input = (
+                input_parameters.input or input_parameters.messages
+            )
+        if current_trace.output is None:
+            current_trace.output = output_parameters.output
+
+    return
+
+
+def create_child_tool_spans(output_parameters: OutputParameters):
+    if output_parameters.tools_called is None:
+        return
+
+    current_span = current_span_context.get()
+    for tool_called in output_parameters.tools_called:
+        tool_span = ToolSpan(
+            **{
+                "uuid": str(uuid.uuid4()),
+                "trace_uuid": current_span.trace_uuid,
+                "parent_uuid": current_span.uuid,
+                "start_time": current_span.start_time,
+                "end_time": current_span.start_time,
+                "status": TraceSpanStatus.SUCCESS,
+                "children": [],
+                "name": tool_called.name,
+                "input": tool_called.input_parameters,
+                "output": None,
+                "metrics": None,
+                "description": tool_called.description,
+            }
+        )
+        current_span.children.append(tool_span)
+
+
+_URL_MAX = 200
+_JSON_MAX = max(
+    len_long(), 400
+)  # <- make this bigger by increasing DEEPEVAL_MAXLEN_LONG above 400
+
+
+def compact_dump(value: Any) -> str:
+    try:
+        dumped = json.dumps(
+            value, ensure_ascii=False, default=str, separators=(",", ":")
+        )
+    except Exception:
+        dumped = repr(value)
+    return shorten(dumped, max_len=_JSON_MAX)
+
+
+def fmt_url(url: Optional[str]) -> str:
+    if not url:
+        return ""
+    if url.startswith("data:"):
+        return "[data-uri]"
+    return shorten(url, max_len=_URL_MAX)
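compact_dump and fmt_url (moved here from deepeval/openai/utils.py and de-underscored) are small enough to characterize directly from the code above. A usage sketch; the expected outputs assume deepeval.utils.shorten leaves strings under max_len untouched:

    from deepeval.model_integrations.utils import compact_dump, fmt_url

    # Tight separators, default=str for non-JSON values, repr() as the
    # last resort; the result is clamped to max(len_long(), 400) chars.
    compact_dump({"tool": "search", "args": [1, 2]})
    # -> '{"tool":"search","args":[1,2]}'

    fmt_url("data:image/png;base64,iVBORw0KGgo...")  # -> '[data-uri]'
    fmt_url("https://example.com/cat.png")           # -> unchanged (< 200 chars)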
deepeval/models/base_model.py
CHANGED

@@ -68,7 +68,9 @@ class DeepEvalBaseLLM(ABC):
         Returns:
             A list of strings.
         """
-        raise
+        raise NotImplementedError(
+            "batch_generate is not implemented for this model"
+        )
 
     @abstractmethod
     def get_model_name(self, *args, **kwargs) -> str:
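Subclasses that never override batch_generate now get a self-explanatory NotImplementedError instead of whatever the truncated raise above produced. A sketch of the caller-visible difference; the abstract-method set shown here follows deepeval's documented custom-model interface and is otherwise an assumption:

    from deepeval.models.base_model import DeepEvalBaseLLM

    class EchoModel(DeepEvalBaseLLM):
        def load_model(self):
            return None

        def generate(self, prompt: str) -> str:
            return prompt

        async def a_generate(self, prompt: str) -> str:
            return prompt

        def get_model_name(self) -> str:
            return "echo"

    EchoModel().batch_generate(["a", "b"])
    # 3.7.0: NotImplementedError: batch_generate is not implemented for this model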
deepeval/openai/__init__.py
CHANGED

@@ -15,5 +15,7 @@ except ImportError:
 
 if OpenAI or AsyncOpenAI:
     from deepeval.openai.patch import patch_openai_classes
+    from deepeval.telemetry import capture_tracing_integration
 
-
+    with capture_tracing_integration("openai"):
+        patch_openai_classes()
deepeval/openai/extractors.py
CHANGED

@@ -4,13 +4,13 @@ from typing import Any, Union, Dict
 from openai.types.responses import Response
 
 from deepeval.test_case.llm_test_case import ToolCall
+
+from deepeval.model_integrations.types import InputParameters, OutputParameters
 from deepeval.openai.utils import (
     render_response_input,
     stringify_multimodal_content,
     render_messages,
 )
-from deepeval.openai.types import InputParameters, OutputParameters
-from deepeval.tracing.types import Message
 
 
 # guarding against errors to be compatible with legacy APIs
deepeval/openai/utils.py
CHANGED

@@ -1,6 +1,6 @@
 import json
 import uuid
-from typing import Any, Dict, List,
+from typing import Any, Dict, List, Iterable
 
 from openai.types.chat.chat_completion_message_param import (
     ChatCompletionMessageParam,
@@ -8,32 +8,8 @@ from openai.types.chat.chat_completion_message_param import (
 
 from deepeval.tracing.types import ToolSpan, TraceSpanStatus
 from deepeval.tracing.context import current_span_context
-from deepeval.
-from deepeval.
-
-
-_URL_MAX = 200
-_JSON_MAX = max(
-    len_long(), 400
-)  # <- make this bigger by increasing DEEPEVAL_MAXLEN_LONG above 400
-
-
-def _compact_dump(value: Any) -> str:
-    try:
-        dumped = json.dumps(
-            value, ensure_ascii=False, default=str, separators=(",", ":")
-        )
-    except Exception:
-        dumped = repr(value)
-    return shorten(dumped, max_len=_JSON_MAX)
-
-
-def _fmt_url(url: Optional[str]) -> str:
-    if not url:
-        return ""
-    if url.startswith("data:"):
-        return "[data-uri]"
-    return shorten(url, max_len=_URL_MAX)
+from deepeval.model_integrations.types import OutputParameters
+from deepeval.model_integrations.utils import compact_dump, fmt_url
 
 
 def create_child_tool_spans(output_parameters: OutputParameters):
@@ -111,7 +87,7 @@ def stringify_multimodal_content(content: Any) -> str:
             url = image_url
         else:
             url = (image_url or {}).get("url") or content.get("url")
-        return f"[image:{
+        return f"[image:{fmt_url(url)}]"
 
     # Responses API variants
     if t == "input_text":
@@ -122,14 +98,14 @@ def stringify_multimodal_content(content: Any) -> str:
             url = image_url
         else:
             url = (image_url or {}).get("url") or content.get("url")
-        return f"[image:{
+        return f"[image:{fmt_url(url)}]"
 
     # readability for other input_* types we don't currently handle
     if t and t.startswith("input_"):
         return f"[{t}]"
 
     # unknown dicts and types returned as shortened JSON
-    return
+    return compact_dump(content)
 
 
 def render_messages(
@@ -228,7 +204,7 @@ def _render_content(content: Dict[str, Any], indent: int = 0) -> str:
         lines.append(f"{prefix}{key}:")
         lines.append(_render_content(value, indent + 1))
     elif isinstance(value, list):
-        lines.append(f"{prefix}{key}: {
+        lines.append(f"{prefix}{key}: {compact_dump(value)}")
     else:
         lines.append(f"{prefix}{key}: {value}")
 
deepeval/prompt/api.py
CHANGED

@@ -1,8 +1,10 @@
-from pydantic import BaseModel, Field, AliasChoices
+from pydantic import BaseModel, Field, AliasChoices, ConfigDict
 from enum import Enum
 from typing import List, Optional
 from pydantic import TypeAdapter
 
+from deepeval.utils import make_model_config
+
 ###################################
 # Model Settings
 ###################################
@@ -92,6 +94,8 @@ class SchemaDataType(Enum):
 
 
 class OutputSchemaField(BaseModel):
+    model_config = make_model_config(use_enum_values=True)
+
     id: str
     type: SchemaDataType
     name: str
@@ -102,9 +106,6 @@ class OutputSchemaField(BaseModel):
         validation_alias=AliasChoices("parent_id", "parentId"),
     )
 
-    class Config:
-        use_enum_values = True
-
 
 class OutputSchema(BaseModel):
     fields: Optional[List[OutputSchemaField]] = None
@@ -187,6 +188,10 @@ class PromptHttpResponse(BaseModel):
 
 
 class PromptPushRequest(BaseModel):
+    model_config = make_model_config(use_enum_values=True)
+
+    model_config = ConfigDict(use_enum_values=True)
+
     alias: str
     text: Optional[str] = None
     messages: Optional[List[PromptMessage]] = None
@@ -203,11 +208,10 @@ class PromptPushRequest(BaseModel):
         default=None, serialization_alias="outputType"
     )
 
-    class Config:
-        use_enum_values = True
-
 
 class PromptUpdateRequest(BaseModel):
+    model_config = make_model_config(use_enum_values=True)
+
     text: Optional[str] = None
     messages: Optional[List[PromptMessage]] = None
     interpolation_type: PromptInterpolationType = Field(
@@ -223,9 +227,6 @@ class PromptUpdateRequest(BaseModel):
         default=None, serialization_alias="outputType"
    )
 
-    class Config:
-        use_enum_values = True
-
 
 class PromptApi(BaseModel):
     id: str
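Every `class Config:` block in this release is replaced by a pydantic v2 `model_config` assignment built through a new deepeval.utils.make_model_config helper (part of the +44 lines in deepeval/utils.py; its body is not visible in this diff). A hypothetical minimal shape, purely an assumption from the call sites above:

    from pydantic import ConfigDict

    def make_model_config(**kwargs) -> ConfigDict:
        # Hypothetical sketch: centralize model configuration so shared
        # defaults can be injected in one place instead of per-model
        # `class Config` blocks. The real implementation may differ.
        return ConfigDict(**kwargs)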
deepeval/prompt/prompt.py
CHANGED

@@ -5,11 +5,13 @@ from rich.console import Console
 import time
 import json
 import os
-from pydantic import BaseModel, ValidationError
+from pydantic import BaseModel, ValidationError, ConfigDict
 import asyncio
 import portalocker
 import threading
 
+from deepeval.utils import make_model_config
+
 from deepeval.prompt.api import (
     PromptHttpResponse,
     PromptMessage,
@@ -77,6 +79,8 @@ class CustomEncoder(json.JSONEncoder):
 
 
 class CachedPrompt(BaseModel):
+    model_config = make_model_config(use_enum_values=True)
+
     alias: str
     version: str
     label: Optional[str] = None
@@ -89,9 +93,6 @@ class CachedPrompt(BaseModel):
     output_type: Optional[OutputType]
     output_schema: Optional[OutputSchema]
 
-    class Config:
-        use_enum_values = True
-
 
 class Prompt:
 
deepeval/telemetry.py
CHANGED

@@ -3,12 +3,12 @@ import logging
 import os
 import socket
 import sys
-from threading import Event
 import uuid
 import sentry_sdk
 from enum import Enum
 from typing import List, Dict
 import requests
+from deepeval.config.settings import get_settings
 from deepeval.constants import LOGIN_PROMPT, HIDDEN_DIR, KEY_FILE
 from posthog import Posthog
 
@@ -34,7 +34,7 @@ TELEMETRY_PATH = os.path.join(HIDDEN_DIR, TELEMETRY_DATA_FILE)
 
 
 def telemetry_opt_out():
-    return
+    return get_settings().DEEPEVAL_TELEMETRY_OPT_OUT
 
 
 def blocked_by_firewall():
@@ -131,7 +131,7 @@ if not telemetry_opt_out():
 
 
 if (
-
+    get_settings().ERROR_REPORTING
     and not blocked_by_firewall()
     and not telemetry_opt_out()
 ):
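Both telemetry_opt_out() and the Sentry error-reporting gate now read from the typed settings object instead of returning a hardcoded value. Assuming the pydantic-settings convention that field names map to environment variables of the same name (not shown in this diff), opting out would look like:

    import os

    # Must be set before importing deepeval: the `if (...)` gate above
    # runs at module import time.
    os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"] = "1"

    import deepeval  # telemetry_opt_out() is truthy; Sentry init is skipped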
deepeval/test_case/llm_test_case.py
CHANGED

@@ -1,5 +1,4 @@
 from pydantic import (
-    ConfigDict,
     Field,
     BaseModel,
     model_validator,
@@ -11,6 +10,8 @@ from enum import Enum
 import json
 import uuid
 
+from deepeval.utils import make_model_config
+
 from deepeval.test_case.mcp import (
     MCPServer,
     MCPPromptCall,
@@ -156,7 +157,7 @@ class ToolCall(BaseModel):
 
 
 class LLMTestCase(BaseModel):
-    model_config =
+    model_config = make_model_config(extra="ignore")
 
     input: str
     actual_output: Optional[str] = Field(
deepeval/test_run/api.py
CHANGED

@@ -1,8 +1,9 @@
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field
 from typing import Optional, List, Union, Dict
 
 from deepeval.test_case import MLLMImage, ToolCall
 from deepeval.tracing.api import TraceApi, MetricData
+from deepeval.utils import make_model_config
 
 
 class LLMApiTestCase(BaseModel):
@@ -49,7 +50,7 @@ class LLMApiTestCase(BaseModel):
     comments: Optional[str] = Field(None)
     trace: Optional[TraceApi] = Field(None)
 
-    model_config =
+    model_config = make_model_config(arbitrary_types_allowed=True)
     # metric_collection: Optional[str] = Field(None, alias="metricCollection")
 
     def update_metric_data(self, metric_data: MetricData):
deepeval/test_run/cache.py
CHANGED

@@ -6,6 +6,8 @@ from typing import List, Optional, Union, Dict, Union
 from enum import Enum
 from pydantic import BaseModel, Field
 
+from deepeval.utils import make_model_config
+
 from deepeval.test_case import LLMTestCaseParams, LLMTestCase, ToolCallParams
 from deepeval.test_run.api import MetricData
 from deepeval.utils import (
@@ -20,6 +22,8 @@ TEMP_CACHE_FILE_NAME = f"{HIDDEN_DIR}/.temp-deepeval-cache.json"
 
 
 class MetricConfiguration(BaseModel):
+    model_config = make_model_config(arbitrary_types_allowed=True)
+
     ##### Required fields #####
     threshold: float
     evaluation_model: Optional[str] = None
@@ -36,9 +40,6 @@ class MetricConfiguration(BaseModel):
         Union[List[LLMTestCaseParams], List[ToolCallParams]]
     ] = None
 
-    class Config:
-        arbitrary_types_allowed = True
-
 
 class CachedMetricData(BaseModel):
     metric_data: MetricData
deepeval/test_run/test_run.py
CHANGED

@@ -463,19 +463,29 @@ class TestRunManager:
                 mode="r",
                 flags=portalocker.LOCK_SH | portalocker.LOCK_NB,
             ) as file:
-
+                loaded = self.test_run.load(file)
+                # only overwrite if loading actually worked
+                self.test_run = loaded
         except (
             FileNotFoundError,
+            json.JSONDecodeError,
            portalocker.exceptions.LockException,
         ) as e:
-            print(
-
+            print(
+                f"Warning: Could not load test run from disk: {e}",
+                file=sys.stderr,
+            )
 
         return self.test_run
 
     def save_test_run(self, path: str, save_under_key: Optional[str] = None):
         if self.save_to_disk:
             try:
+                # ensure parent directory exists
+                parent = os.path.dirname(path)
+                if parent:
+                    os.makedirs(parent, exist_ok=True)
+
                 with portalocker.Lock(path, mode="w") as file:
                     if save_under_key:
                         try:
@@ -533,10 +543,19 @@ class TestRunManager:
                     self.test_run.save(file)
             except (
                 FileNotFoundError,
+                json.JSONDecodeError,
                 portalocker.exceptions.LockException,
             ) as e:
-                print(
-
+                print(
+                    f"Warning: Could not update test run on disk: {e}",
+                    file=sys.stderr,
+                )
+                if self.test_run is None:
+                    # guarantee a valid in-memory run so the update can proceed.
+                    # never destroy in-memory state on I/O failure.
+                    self.create_test_run()
+                self.test_run.add_test_case(api_test_case)
+                self.test_run.set_dataset_properties(test_case)
             else:
                 if self.test_run is None:
                     self.create_test_run()
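The save-path hardening covers three failure modes: a failed load no longer clobbers the in-memory run, json.JSONDecodeError joins the handled exceptions, and the cache file's parent directory is created before locking. The `if parent:` guard matters because os.path.dirname() of a bare filename is the empty string, and os.makedirs("") raises FileNotFoundError; a standalone illustration:

    import os

    def ensure_parent(path: str) -> None:
        parent = os.path.dirname(path)
        if parent:  # skip "" for bare filenames like "run.json"
            os.makedirs(parent, exist_ok=True)

    ensure_parent(".deepeval/.deepeval-cache.json")  # creates .deepeval/ if needed
    ensure_parent("run.json")                        # no-op: no parent component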
deepeval/tracing/api.py
CHANGED

@@ -1,8 +1,9 @@
 from enum import Enum
 from typing import Dict, List, Optional, Union, Literal, Any
-from pydantic import BaseModel,
+from pydantic import BaseModel, Field
 
 from deepeval.test_case import ToolCall
+from deepeval.utils import make_model_config
 
 
 class SpanApiType(Enum):
@@ -27,7 +28,7 @@ class PromptApi(BaseModel):
 
 
 class MetricData(BaseModel):
-    model_config =
+    model_config = make_model_config(extra="ignore")
 
     name: str
     threshold: float
@@ -42,6 +43,10 @@ class MetricData(BaseModel):
 
 
 class BaseApiSpan(BaseModel):
+    model_config = make_model_config(
+        use_enum_values=True, validate_assignment=True
+    )
+
     uuid: str
     name: str = None
     status: TraceSpanApiStatus
@@ -96,12 +101,12 @@ class BaseApiSpan(BaseModel):
     metric_collection: Optional[str] = Field(None, alias="metricCollection")
     metrics_data: Optional[List[MetricData]] = Field(None, alias="metricsData")
 
-    class Config:
-        use_enum_values = True
-        validate_assignment = True
-
 
 class TraceApi(BaseModel):
+    model_config = make_model_config(
+        use_enum_values=True, validate_assignment=True
+    )
+
     uuid: str
     base_spans: Optional[List[BaseApiSpan]] = Field(None, alias="baseSpans")
     agent_spans: Optional[List[BaseApiSpan]] = Field(None, alias="agentSpans")
@@ -139,7 +144,3 @@ class TraceApi(BaseModel):
 
     # Don't serialize these
     confident_api_key: Optional[str] = Field(None, exclude=True)
-
-    class Config:
-        use_enum_values = True
-        validate_assignment = True
|