deepeval 3.6.9__py3-none-any.whl → 3.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +0 -4
- deepeval/_version.py +1 -1
- deepeval/anthropic/__init__.py +19 -0
- deepeval/anthropic/extractors.py +94 -0
- deepeval/anthropic/patch.py +169 -0
- deepeval/anthropic/utils.py +225 -0
- deepeval/benchmarks/drop/drop.py +40 -14
- deepeval/benchmarks/ifeval/ifeval.py +2 -2
- deepeval/cli/main.py +7 -0
- deepeval/confident/api.py +6 -1
- deepeval/confident/types.py +4 -2
- deepeval/config/settings.py +159 -11
- deepeval/config/settings_manager.py +4 -0
- deepeval/evaluate/compare.py +215 -4
- deepeval/evaluate/types.py +6 -0
- deepeval/evaluate/utils.py +30 -0
- deepeval/integrations/crewai/handler.py +36 -0
- deepeval/integrations/langchain/callback.py +27 -2
- deepeval/integrations/llama_index/handler.py +58 -4
- deepeval/integrations/llama_index/utils.py +24 -0
- deepeval/key_handler.py +1 -0
- deepeval/metrics/__init__.py +5 -0
- deepeval/metrics/arena_g_eval/arena_g_eval.py +5 -1
- deepeval/metrics/arena_g_eval/utils.py +5 -5
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +9 -18
- deepeval/metrics/exact_match/__init__.py +0 -0
- deepeval/metrics/exact_match/exact_match.py +94 -0
- deepeval/metrics/g_eval/g_eval.py +5 -1
- deepeval/metrics/g_eval/utils.py +1 -1
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +5 -1
- deepeval/metrics/pattern_match/__init__.py +0 -0
- deepeval/metrics/pattern_match/pattern_match.py +103 -0
- deepeval/metrics/task_completion/task_completion.py +9 -2
- deepeval/metrics/utils.py +1 -1
- deepeval/model_integrations/__init__.py +0 -0
- deepeval/model_integrations/utils.py +116 -0
- deepeval/models/base_model.py +3 -1
- deepeval/models/llms/gemini_model.py +27 -5
- deepeval/openai/__init__.py +3 -1
- deepeval/openai/extractors.py +2 -2
- deepeval/openai/utils.py +7 -31
- deepeval/openai_agents/callback_handler.py +12 -3
- deepeval/prompt/api.py +11 -10
- deepeval/prompt/prompt.py +27 -15
- deepeval/simulator/template.py +1 -1
- deepeval/telemetry.py +3 -3
- deepeval/test_case/__init__.py +2 -1
- deepeval/test_case/arena_test_case.py +15 -4
- deepeval/test_case/llm_test_case.py +3 -2
- deepeval/test_case/mllm_test_case.py +45 -22
- deepeval/test_run/api.py +3 -2
- deepeval/test_run/cache.py +35 -13
- deepeval/test_run/hyperparameters.py +5 -1
- deepeval/test_run/test_run.py +52 -14
- deepeval/tracing/api.py +11 -10
- deepeval/tracing/otel/exporter.py +11 -0
- deepeval/tracing/patchers.py +102 -1
- deepeval/tracing/trace_context.py +13 -4
- deepeval/tracing/tracing.py +11 -2
- deepeval/tracing/types.py +8 -8
- deepeval/tracing/utils.py +9 -0
- deepeval/utils.py +48 -2
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/METADATA +3 -3
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/RECORD +68 -58
- /deepeval/{openai → model_integrations}/types.py +0 -0
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/WHEEL +0 -0
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/entry_points.txt +0 -0
deepeval/model_integrations/utils.py
ADDED

@@ -0,0 +1,116 @@
+import json
+import uuid
+from typing import Any, List, Optional
+
+from deepeval.model_integrations.types import InputParameters, OutputParameters
+from deepeval.test_case.llm_test_case import ToolCall
+from deepeval.tracing.context import (
+    current_span_context,
+    current_trace_context,
+    update_current_span,
+    update_llm_span,
+)
+from deepeval.tracing.trace_context import current_llm_context
+from deepeval.tracing.types import ToolSpan, TraceSpanStatus
+from deepeval.utils import shorten, len_long
+
+
+def _update_all_attributes(
+    input_parameters: InputParameters,
+    output_parameters: OutputParameters,
+    expected_tools: List[ToolCall],
+    expected_output: str,
+    context: List[str],
+    retrieval_context: List[str],
+):
+    """Update span and trace attributes with input/output parameters."""
+    update_current_span(
+        input=input_parameters.input or input_parameters.messages or "NA",
+        output=output_parameters.output or "NA",
+        tools_called=output_parameters.tools_called,
+        # attributes to be added
+        expected_output=expected_output,
+        expected_tools=expected_tools,
+        context=context,
+        retrieval_context=retrieval_context,
+    )
+
+    llm_context = current_llm_context.get()
+
+    update_llm_span(
+        input_token_count=output_parameters.prompt_tokens,
+        output_token_count=output_parameters.completion_tokens,
+        prompt=llm_context.prompt,
+    )
+
+    if output_parameters.tools_called:
+        create_child_tool_spans(output_parameters)
+
+    __update_input_and_output_of_current_trace(
+        input_parameters, output_parameters
+    )
+
+
+def __update_input_and_output_of_current_trace(
+    input_parameters: InputParameters, output_parameters: OutputParameters
+):
+
+    current_trace = current_trace_context.get()
+    if current_trace:
+        if current_trace.input is None:
+            current_trace.input = (
+                input_parameters.input or input_parameters.messages
+            )
+        if current_trace.output is None:
+            current_trace.output = output_parameters.output
+
+    return
+
+
+def create_child_tool_spans(output_parameters: OutputParameters):
+    if output_parameters.tools_called is None:
+        return
+
+    current_span = current_span_context.get()
+    for tool_called in output_parameters.tools_called:
+        tool_span = ToolSpan(
+            **{
+                "uuid": str(uuid.uuid4()),
+                "trace_uuid": current_span.trace_uuid,
+                "parent_uuid": current_span.uuid,
+                "start_time": current_span.start_time,
+                "end_time": current_span.start_time,
+                "status": TraceSpanStatus.SUCCESS,
+                "children": [],
+                "name": tool_called.name,
+                "input": tool_called.input_parameters,
+                "output": None,
+                "metrics": None,
+                "description": tool_called.description,
+            }
+        )
+        current_span.children.append(tool_span)
+
+
+_URL_MAX = 200
+_JSON_MAX = max(
+    len_long(), 400
+)  # <- make this bigger by increasing DEEPEVAL_MAXLEN_LONG above 400
+
+
+def compact_dump(value: Any) -> str:
+    try:
+        dumped = json.dumps(
+            value, ensure_ascii=False, default=str, separators=(",", ":")
+        )
+    except Exception:
+        dumped = repr(value)
+    return shorten(dumped, max_len=_JSON_MAX)
+
+
+def fmt_url(url: Optional[str]) -> str:
+    if not url:
+        return ""
+    if url.startswith("data:"):
+        return "[data-uri]"
+    return shorten(url, max_len=_URL_MAX)
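The truncation helpers at the bottom of this new module (compact_dump, fmt_url) become the shared implementation for all model integrations. A standalone sketch of their behavior; the shorten stub below is a stand-in for deepeval.utils.shorten, whose exact truncation marker may differ:

import json
from typing import Any, Optional

_URL_MAX = 200
_JSON_MAX = 400  # in deepeval this floor can be raised via DEEPEVAL_MAXLEN_LONG


def shorten(text: str, max_len: int) -> str:
    # hypothetical stand-in for deepeval.utils.shorten
    return text if len(text) <= max_len else text[:max_len] + "..."


def compact_dump(value: Any) -> str:
    # densest possible JSON; falls back to repr() for non-serializable values
    try:
        dumped = json.dumps(
            value, ensure_ascii=False, default=str, separators=(",", ":")
        )
    except Exception:
        dumped = repr(value)
    return shorten(dumped, max_len=_JSON_MAX)


def fmt_url(url: Optional[str]) -> str:
    # collapse base64 data URIs so logged traces stay readable
    if not url:
        return ""
    if url.startswith("data:"):
        return "[data-uri]"
    return shorten(url, max_len=_URL_MAX)


print(compact_dump({"role": "user", "content": "hi"}))  # {"role":"user","content":"hi"}
print(fmt_url("data:image/png;base64,AAAA"))            # [data-uri]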
deepeval/models/base_model.py
CHANGED

@@ -68,7 +68,9 @@ class DeepEvalBaseLLM(ABC):
         Returns:
             A list of strings.
         """
-        raise
+        raise NotImplementedError(
+            "batch_generate is not implemented for this model"
+        )
 
     @abstractmethod
     def get_model_name(self, *args, **kwargs) -> str:
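For context, the bare raise this replaces was a latent bug: outside an active except block, a bare raise fails with its own RuntimeError instead of telling the caller the method is unimplemented. A minimal illustration:

def batch_generate_old(prompts):
    raise  # RuntimeError: No active exception to re-raise


def batch_generate_new(prompts):
    raise NotImplementedError(
        "batch_generate is not implemented for this model"
    )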
deepeval/models/llms/gemini_model.py
CHANGED

@@ -1,7 +1,6 @@
 from pydantic import BaseModel
-from google.genai import types
+from google.genai import types, Client
 from typing import Optional, Dict
-from google import genai
 
 from deepeval.models.retry_policy import (
     create_retry_decorator,
@@ -9,7 +8,8 @@ from deepeval.models.retry_policy import (
 from deepeval.key_handler import ModelKeyValues, KEY_FILE_HANDLER
 from deepeval.models.base_model import DeepEvalBaseLLM
 from deepeval.constants import ProviderSlug as PS
-
+from google.oauth2 import service_account
+import json
 
 default_gemini_model = "gemini-1.5-pro"
 
@@ -52,6 +52,7 @@ class GeminiModel(DeepEvalBaseLLM):
         api_key: Optional[str] = None,
         project: Optional[str] = None,
         location: Optional[str] = None,
+        service_account_key: Optional[Dict[str, str]] = None,
         temperature: float = 0,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
@@ -75,6 +76,17 @@ class GeminiModel(DeepEvalBaseLLM):
         self.use_vertexai = KEY_FILE_HANDLER.fetch_data(
             ModelKeyValues.GOOGLE_GENAI_USE_VERTEXAI
         )
+        if service_account_key:
+            self.service_account_key = service_account_key
+        else:
+            service_account_key_data = KEY_FILE_HANDLER.fetch_data(
+                ModelKeyValues.GOOGLE_SERVICE_ACCOUNT_KEY
+            )
+            if service_account_key_data is None:
+                self.service_account_key = None
+            elif isinstance(service_account_key_data, str):
+                self.service_account_key = json.loads(service_account_key_data)
+
         if temperature < 0:
             raise ValueError("Temperature must be >= 0.")
         self.temperature = temperature
@@ -117,10 +129,20 @@ class GeminiModel(DeepEvalBaseLLM):
             )
 
             # Create client for Vertex AI
-            self.client = genai.Client(
+            self.client = Client(
                 vertexai=True,
                 project=self.project,
                 location=self.location,
+                credentials=(
+                    service_account.Credentials.from_service_account_info(
+                        self.service_account_key,
+                        scopes=[
+                            "https://www.googleapis.com/auth/cloud-platform"
+                        ],
+                    )
+                    if self.service_account_key
+                    else None
+                ),
                 **self.kwargs,
             )
         else:
@@ -130,7 +152,7 @@ class GeminiModel(DeepEvalBaseLLM):
                 "or set it in your DeepEval configuration."
            )
             # Create client for Gemini API
-            self.client = genai.Client(api_key=self.api_key, **self.kwargs)
+            self.client = Client(api_key=self.api_key, **self.kwargs)
 
         # Configure default model generation settings
         self.model_safety_settings = [
deepeval/openai/__init__.py
CHANGED

@@ -15,5 +15,7 @@ except ImportError:
 
 if OpenAI or AsyncOpenAI:
     from deepeval.openai.patch import patch_openai_classes
+    from deepeval.telemetry import capture_tracing_integration
 
-    patch_openai_classes()
+    with capture_tracing_integration("openai"):
+        patch_openai_classes()
deepeval/openai/extractors.py
CHANGED

@@ -4,13 +4,13 @@ from typing import Any, Union, Dict
 from openai.types.responses import Response
 
 from deepeval.test_case.llm_test_case import ToolCall
+
+from deepeval.model_integrations.types import InputParameters, OutputParameters
 from deepeval.openai.utils import (
     render_response_input,
     stringify_multimodal_content,
     render_messages,
 )
-from deepeval.openai.types import InputParameters, OutputParameters
-from deepeval.tracing.types import Message
 
 
 # guarding against errors to be compatible with legacy APIs
deepeval/openai/utils.py
CHANGED

@@ -1,6 +1,6 @@
 import json
 import uuid
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Iterable
 
 from openai.types.chat.chat_completion_message_param import (
     ChatCompletionMessageParam,
@@ -8,32 +8,8 @@ from openai.types.chat.chat_completion_message_param import (
 
 from deepeval.tracing.types import ToolSpan, TraceSpanStatus
 from deepeval.tracing.context import current_span_context
-from deepeval.utils import shorten, len_long
-from deepeval.openai.types import InputParameters, OutputParameters
-
-
-_URL_MAX = 200
-_JSON_MAX = max(
-    len_long(), 400
-)  # <- make this bigger by increasing DEEPEVAL_MAXLEN_LONG above 400
-
-
-def _compact_dump(value: Any) -> str:
-    try:
-        dumped = json.dumps(
-            value, ensure_ascii=False, default=str, separators=(",", ":")
-        )
-    except Exception:
-        dumped = repr(value)
-    return shorten(dumped, max_len=_JSON_MAX)
-
-
-def _fmt_url(url: Optional[str]) -> str:
-    if not url:
-        return ""
-    if url.startswith("data:"):
-        return "[data-uri]"
-    return shorten(url, max_len=_URL_MAX)
+from deepeval.model_integrations.types import OutputParameters
+from deepeval.model_integrations.utils import compact_dump, fmt_url
 
 
 def create_child_tool_spans(output_parameters: OutputParameters):
@@ -111,7 +87,7 @@ def stringify_multimodal_content(content: Any) -> str:
             url = image_url
         else:
             url = (image_url or {}).get("url") or content.get("url")
-        return f"[image:{_fmt_url(url)}]"
+        return f"[image:{fmt_url(url)}]"
 
     # Responses API variants
     if t == "input_text":
@@ -122,14 +98,14 @@ def stringify_multimodal_content(content: Any) -> str:
             url = image_url
         else:
             url = (image_url or {}).get("url") or content.get("url")
-        return f"[image:{_fmt_url(url)}]"
+        return f"[image:{fmt_url(url)}]"
 
     # readability for other input_* types we don't currently handle
     if t and t.startswith("input_"):
         return f"[{t}]"
 
     # unknown dicts and types returned as shortened JSON
-    return _compact_dump(content)
+    return compact_dump(content)
 
 
 def render_messages(
@@ -228,7 +204,7 @@ def _render_content(content: Dict[str, Any], indent: int = 0) -> str:
             lines.append(f"{prefix}{key}:")
             lines.append(_render_content(value, indent + 1))
         elif isinstance(value, list):
-            lines.append(f"{prefix}{key}: {_compact_dump(value)}")
+            lines.append(f"{prefix}{key}: {compact_dump(value)}")
         else:
             lines.append(f"{prefix}{key}: {value}")
 
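The net effect on multimodal rendering: image parts now go through the shared fmt_url, so data URIs are collapsed rather than dumped verbatim into traces. A small sketch grounded in the image_url branch above:

from deepeval.model_integrations.utils import fmt_url

content = {"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}}
url = content["image_url"]["url"]
print(f"[image:{fmt_url(url)}]")  # -> [image:[data-uri]]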
deepeval/openai_agents/callback_handler.py
CHANGED

@@ -1,13 +1,21 @@
+from time import perf_counter
+
 from deepeval.tracing.tracing import (
     Observer,
     current_span_context,
     trace_manager,
 )
-from deepeval.openai_agents.extractors import update_span_properties
+from deepeval.openai_agents.extractors import (
+    update_span_properties,
+    update_trace_properties_from_span_data,
+)
 from deepeval.tracing.context import current_trace_context
 from deepeval.tracing.utils import make_json_serializable
-from
-
+from deepeval.tracing.types import (
+    BaseSpan,
+    LlmSpan,
+    TraceSpanStatus,
+)
 
 try:
     from agents.tracing import Span, Trace, TracingProcessor
@@ -18,6 +26,7 @@ try:
         GenerationSpanData,
         GuardrailSpanData,
         HandoffSpanData,
+        MCPListToolsSpanData,
         ResponseSpanData,
         SpanData,
     )
deepeval/prompt/api.py
CHANGED

@@ -1,8 +1,10 @@
-from pydantic import BaseModel, Field, AliasChoices
+from pydantic import BaseModel, Field, AliasChoices, ConfigDict
 from enum import Enum
 from typing import List, Optional
 from pydantic import TypeAdapter
 
+from deepeval.utils import make_model_config
+
 ###################################
 # Model Settings
 ###################################
@@ -92,6 +94,8 @@ class SchemaDataType(Enum):
 
 
 class OutputSchemaField(BaseModel):
+    model_config = make_model_config(use_enum_values=True)
+
     id: str
     type: SchemaDataType
     name: str
@@ -102,9 +106,6 @@ class OutputSchemaField(BaseModel):
         validation_alias=AliasChoices("parent_id", "parentId"),
     )
 
-    class Config:
-        use_enum_values = True
-
 
 class OutputSchema(BaseModel):
     fields: Optional[List[OutputSchemaField]] = None
@@ -187,6 +188,10 @@ class PromptHttpResponse(BaseModel):
 
 
 class PromptPushRequest(BaseModel):
+    model_config = make_model_config(use_enum_values=True)
+
+    model_config = ConfigDict(use_enum_values=True)
+
     alias: str
     text: Optional[str] = None
     messages: Optional[List[PromptMessage]] = None
@@ -203,11 +208,10 @@ class PromptPushRequest(BaseModel):
         default=None, serialization_alias="outputType"
     )
 
-    class Config:
-        use_enum_values = True
-
 
 class PromptUpdateRequest(BaseModel):
+    model_config = make_model_config(use_enum_values=True)
+
     text: Optional[str] = None
     messages: Optional[List[PromptMessage]] = None
     interpolation_type: PromptInterpolationType = Field(
@@ -223,9 +227,6 @@ class PromptUpdateRequest(BaseModel):
         default=None, serialization_alias="outputType"
     )
 
-    class Config:
-        use_enum_values = True
-
 
 class PromptApi(BaseModel):
     id: str
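These models swap pydantic v1-style class Config blocks for v2-style model_config built by the new make_model_config helper (note that the diff leaves PromptPushRequest with two consecutive model_config assignments; the later one wins at class-creation time). Assuming make_model_config is a thin wrapper over pydantic's ConfigDict, the migration pattern looks like this:

from enum import Enum

from pydantic import BaseModel, ConfigDict


def make_model_config(**kwargs) -> ConfigDict:
    # stand-in for deepeval.utils.make_model_config; assumed to wrap ConfigDict
    return ConfigDict(**kwargs)


class Color(Enum):
    RED = "red"


class NewStyle(BaseModel):
    # replaces the removed `class Config: use_enum_values = True`
    model_config = make_model_config(use_enum_values=True)

    color: Color


assert NewStyle(color="red").color == "red"  # enum field stored as its raw value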
deepeval/prompt/prompt.py
CHANGED

@@ -1,3 +1,8 @@
+import logging
+import time
+import json
+import os
+
 from enum import Enum
 from typing import Optional, List, Dict, Type, Literal
 from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
@@ -7,9 +12,10 @@ import json
 import os
 from pydantic import BaseModel, ValidationError
 import asyncio
-import portalocker
 import threading
 
+from deepeval.utils import make_model_config, is_read_only_env
+
 from deepeval.prompt.api import (
     PromptHttpResponse,
     PromptMessage,
@@ -22,9 +28,6 @@ from deepeval.prompt.api import (
     ModelSettings,
     OutputSchema,
     OutputType,
-    ReasoningEffort,
-    Verbosity,
-    ModelProvider,
 )
 from deepeval.prompt.utils import (
     interpolate_text,
@@ -34,6 +37,18 @@ from deepeval.prompt.utils import (
 from deepeval.confident.api import Api, Endpoints, HttpMethods
 from deepeval.constants import HIDDEN_DIR
 
+
+logger = logging.getLogger(__name__)
+
+portalocker = None
+if not is_read_only_env():
+    try:
+        import portalocker
+    except Exception as e:
+        logger.warning("failed to import portalocker: %s", e)
+else:
+    logger.warning("READ_ONLY filesystem: skipping disk cache for prompts.")
+
 CACHE_FILE_NAME = f"{HIDDEN_DIR}/.deepeval-prompt-cache.json"
 VERSION_CACHE_KEY = "version"
 LABEL_CACHE_KEY = "label"
@@ -77,6 +92,8 @@ class CustomEncoder(json.JSONEncoder):
 
 
 class CachedPrompt(BaseModel):
+    model_config = make_model_config(use_enum_values=True)
+
     alias: str
     version: str
     label: Optional[str] = None
@@ -89,9 +106,6 @@ class CachedPrompt(BaseModel):
     output_type: Optional[OutputType]
     output_schema: Optional[OutputSchema]
 
-    class Config:
-        use_enum_values = True
-
 
 class Prompt:
 
@@ -164,7 +178,7 @@ class Prompt:
             content = f.read()
             try:
                 data = json.loads(content)
-            except:
+            except (json.JSONDecodeError, TypeError):
                 self.text_template = content
                 return content
 
@@ -202,7 +216,6 @@ class Prompt:
                 "Unable to interpolate empty prompt template. Please pull a prompt from Confident AI or set template manually to continue."
                )
 
-            print("@@@@@")
             return interpolate_text(interpolation_type, text_template, **kwargs)
 
         elif prompt_type == PromptType.LIST:
@@ -247,7 +260,7 @@ class Prompt:
         version: Optional[str] = None,
         label: Optional[str] = None,
     ) -> Optional[CachedPrompt]:
-        if not os.path.exists(CACHE_FILE_NAME):
+        if portalocker is None or not os.path.exists(CACHE_FILE_NAME):
            return None
 
         try:
@@ -295,13 +308,12 @@ class Prompt:
         output_type: Optional[OutputType] = None,
         output_schema: Optional[OutputSchema] = None,
     ):
-        if not self.alias:
+        if portalocker is None or not self.alias:
            return
 
-        # Ensure directory exists
-        os.makedirs(HIDDEN_DIR, exist_ok=True)
-
         try:
+            # Ensure directory exists
+            os.makedirs(HIDDEN_DIR, exist_ok=True)
             # Use r+ mode if file exists, w mode if it doesn't
             mode = "r+" if os.path.exists(CACHE_FILE_NAME) else "w"
 
@@ -480,7 +492,7 @@ class Prompt:
                     cached_prompt.output_schema
                )
                 return
-            except:
+            except Exception:
                 pass
 
         api = Api()
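The portalocker changes make the prompt disk cache an optional feature: the module no longer hard-imports portalocker, and every cache path first checks that the import succeeded. The guard pattern in isolation, with a stand-in for the new deepeval.utils.is_read_only_env check:

import logging
import os

logger = logging.getLogger(__name__)


def is_read_only_env() -> bool:
    # stand-in; the real check lives in deepeval.utils
    return os.environ.get("DEEPEVAL_READ_ONLY_FS") == "1"


portalocker = None
if not is_read_only_env():
    try:
        import portalocker
    except Exception as e:
        logger.warning("failed to import portalocker: %s", e)
else:
    logger.warning("READ_ONLY filesystem: skipping disk cache.")


def read_cache(path: str):
    # every cache entry point bails out when file locking is unavailable
    if portalocker is None or not os.path.exists(path):
        return None
    with portalocker.Lock(path, mode="r") as f:
        return f.read()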
deepeval/simulator/template.py
CHANGED

@@ -112,7 +112,7 @@ class ConversationSimulatorTemplate:
         ]
         Example JSON Output:
         {{
-            "is_complete":
+            "is_complete": false,
             "reason": "The assistant explained how to forget password but ahas not confirmed that the user successfully set a new password."
         }}
 
deepeval/telemetry.py
CHANGED

@@ -3,12 +3,12 @@ import logging
 import os
 import socket
 import sys
-from threading import Event
 import uuid
 import sentry_sdk
 from enum import Enum
 from typing import List, Dict
 import requests
+from deepeval.config.settings import get_settings
 from deepeval.constants import LOGIN_PROMPT, HIDDEN_DIR, KEY_FILE
 from posthog import Posthog
 
@@ -34,7 +34,7 @@ TELEMETRY_PATH = os.path.join(HIDDEN_DIR, TELEMETRY_DATA_FILE)
 
 
 def telemetry_opt_out():
-    return
+    return get_settings().DEEPEVAL_TELEMETRY_OPT_OUT
 
 
 def blocked_by_firewall():
@@ -131,7 +131,7 @@ if not telemetry_opt_out():
 
 
 if (
-
+    get_settings().ERROR_REPORTING
     and not blocked_by_firewall()
     and not telemetry_opt_out()
 ):
deepeval/test_case/__init__.py
CHANGED

@@ -10,7 +10,7 @@ from .conversational_test_case import (
     TurnParams,
 )
 from .mllm_test_case import MLLMTestCase, MLLMTestCaseParams, MLLMImage
-from .arena_test_case import ArenaTestCase
+from .arena_test_case import ArenaTestCase, Contestant
 from .mcp import (
     MCPServer,
     MCPPromptCall,
@@ -35,4 +35,5 @@ __all__ = [
     "MLLMTestCaseParams",
     "MLLMImage",
     "ArenaTestCase",
+    "Contestant",
 ]
deepeval/test_case/arena_test_case.py
CHANGED

@@ -1,20 +1,31 @@
+from typing import List, Dict, Optional, Union
 from dataclasses import dataclass
-from
+from pydantic import BaseModel
+
 from deepeval.test_case import (
     LLMTestCase,
 )
+from deepeval.prompt import Prompt
+
+
+class Contestant(BaseModel):
+    name: str
+    test_case: LLMTestCase
+    hyperparameters: Optional[Dict[str, Union[str, int, float, Prompt]]] = None
+
+    model_config = {"arbitrary_types_allowed": True}
 
 
 @dataclass
 class ArenaTestCase:
-    contestants:
+    contestants: List[Contestant]
 
     def __post_init__(self):
-        contestant_names =
+        contestant_names = [contestant.name for contestant in self.contestants]
         if len(contestant_names) != len(set(contestant_names)):
             raise ValueError("All contestant names must be unique.")
 
-        cases =
+        cases = [contestant.test_case for contestant in self.contestants]
         ref_input = cases[0].input
         for case in cases[1:]:
             if case.input != ref_input:
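Hypothetical usage of the new Contestant model: each contestant pairs a unique name with an LLMTestCase (plus optional hyperparameters), and __post_init__ enforces that names are unique and that every test case shares the same input:

from deepeval.test_case import ArenaTestCase, Contestant, LLMTestCase

shared_input = "Why is the sky blue?"

arena = ArenaTestCase(
    contestants=[
        Contestant(
            name="model-a",
            test_case=LLMTestCase(
                input=shared_input, actual_output="Rayleigh scattering."
            ),
            hyperparameters={"temperature": 0},  # optional
        ),
        Contestant(
            name="model-b",
            test_case=LLMTestCase(
                input=shared_input, actual_output="Light scattering in air."
            ),
        ),
    ]
)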
deepeval/test_case/llm_test_case.py
CHANGED

@@ -1,5 +1,4 @@
 from pydantic import (
-    ConfigDict,
     Field,
     BaseModel,
     model_validator,
@@ -11,6 +10,8 @@ from enum import Enum
 import json
 import uuid
 
+from deepeval.utils import make_model_config
+
 from deepeval.test_case.mcp import (
     MCPServer,
     MCPPromptCall,
@@ -156,7 +157,7 @@ class ToolCall(BaseModel):
 
 
 class LLMTestCase(BaseModel):
-    model_config = ConfigDict(extra="ignore")
+    model_config = make_model_config(extra="ignore")
 
     input: str
     actual_output: Optional[str] = Field(