deepeval 3.6.8__py3-none-any.whl → 3.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/anthropic/__init__.py +19 -0
- deepeval/anthropic/extractors.py +94 -0
- deepeval/anthropic/patch.py +169 -0
- deepeval/anthropic/utils.py +225 -0
- deepeval/benchmarks/drop/drop.py +40 -14
- deepeval/benchmarks/ifeval/ifeval.py +2 -2
- deepeval/confident/types.py +4 -2
- deepeval/config/settings.py +258 -47
- deepeval/config/settings_manager.py +4 -0
- deepeval/config/utils.py +5 -0
- deepeval/dataset/dataset.py +162 -30
- deepeval/dataset/utils.py +41 -13
- deepeval/evaluate/execute.py +1099 -633
- deepeval/integrations/crewai/handler.py +36 -0
- deepeval/integrations/langchain/callback.py +27 -2
- deepeval/integrations/llama_index/handler.py +58 -4
- deepeval/integrations/llama_index/utils.py +24 -0
- deepeval/metrics/__init__.py +5 -0
- deepeval/metrics/exact_match/__init__.py +0 -0
- deepeval/metrics/exact_match/exact_match.py +94 -0
- deepeval/metrics/indicator.py +21 -1
- deepeval/metrics/pattern_match/__init__.py +0 -0
- deepeval/metrics/pattern_match/pattern_match.py +103 -0
- deepeval/metrics/task_completion/task_completion.py +9 -2
- deepeval/model_integrations/__init__.py +0 -0
- deepeval/model_integrations/utils.py +116 -0
- deepeval/models/base_model.py +3 -1
- deepeval/models/llms/amazon_bedrock_model.py +20 -17
- deepeval/models/llms/openai_model.py +10 -1
- deepeval/models/retry_policy.py +103 -20
- deepeval/openai/__init__.py +3 -1
- deepeval/openai/extractors.py +2 -2
- deepeval/openai/utils.py +7 -31
- deepeval/prompt/api.py +11 -10
- deepeval/prompt/prompt.py +5 -4
- deepeval/simulator/conversation_simulator.py +25 -18
- deepeval/synthesizer/chunking/context_generator.py +9 -1
- deepeval/telemetry.py +3 -3
- deepeval/test_case/llm_test_case.py +3 -2
- deepeval/test_run/api.py +3 -2
- deepeval/test_run/cache.py +4 -3
- deepeval/test_run/test_run.py +24 -5
- deepeval/tracing/api.py +11 -10
- deepeval/tracing/otel/exporter.py +11 -0
- deepeval/tracing/patchers.py +102 -1
- deepeval/tracing/trace_context.py +13 -4
- deepeval/tracing/tracing.py +10 -1
- deepeval/tracing/types.py +8 -8
- deepeval/tracing/utils.py +9 -0
- deepeval/utils.py +44 -2
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/METADATA +2 -2
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/RECORD +57 -47
- /deepeval/{openai → model_integrations}/types.py +0 -0
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/WHEEL +0 -0
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/entry_points.txt +0 -0
deepeval/_version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__: str = "3.6.8"
|
|
1
|
+
__version__: str = "3.7.0"
|
|
deepeval/anthropic/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
try:
|
|
2
|
+
import anthropic # noqa: F401
|
|
3
|
+
except ImportError:
|
|
4
|
+
raise ModuleNotFoundError(
|
|
5
|
+
"Please install anthropic to use this feature: 'pip install anthropic'"
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
from anthropic import Anthropic, AsyncAnthropic # noqa: F401
|
|
10
|
+
except ImportError:
|
|
11
|
+
Anthropic = None # type: ignore
|
|
12
|
+
AsyncAnthropic = None # type: ignore
|
|
13
|
+
|
|
14
|
+
if Anthropic or AsyncAnthropic:
|
|
15
|
+
from deepeval.anthropic.patch import patch_anthropic_classes
|
|
16
|
+
from deepeval.telemetry import capture_tracing_integration
|
|
17
|
+
|
|
18
|
+
with capture_tracing_integration("anthropic"):
|
|
19
|
+
patch_anthropic_classes()
|
|
deepeval/anthropic/extractors.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from anthropic.types.message import Message
|
|
2
|
+
from anthropic.types import ToolUseBlock
|
|
3
|
+
from typing import Any, Dict
|
|
4
|
+
|
|
5
|
+
from deepeval.anthropic.utils import (
|
|
6
|
+
render_messages_anthropic,
|
|
7
|
+
stringify_anthropic_content,
|
|
8
|
+
)
|
|
9
|
+
from deepeval.model_integrations.types import InputParameters, OutputParameters
|
|
10
|
+
from deepeval.test_case.llm_test_case import ToolCall
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def safe_extract_input_parameters(kwargs: Dict[str, Any]) -> InputParameters:
|
|
14
|
+
# guarding against errors to be compatible with legacy APIs
|
|
15
|
+
try:
|
|
16
|
+
return extract_messages_api_input_parameters(kwargs)
|
|
17
|
+
except:
|
|
18
|
+
return InputParameters(model="NA")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def extract_messages_api_input_parameters(
|
|
22
|
+
kwargs: Dict[str, Any],
|
|
23
|
+
) -> InputParameters:
|
|
24
|
+
model = kwargs.get("model")
|
|
25
|
+
tools = kwargs.get("tools")
|
|
26
|
+
messages = kwargs.get("messages")
|
|
27
|
+
tool_descriptions = (
|
|
28
|
+
{tool["name"]: tool["description"] for tool in tools}
|
|
29
|
+
if tools is not None
|
|
30
|
+
else None
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
input_argument = ""
|
|
34
|
+
user_messages = []
|
|
35
|
+
for message in messages:
|
|
36
|
+
role = message["role"]
|
|
37
|
+
if role == "user":
|
|
38
|
+
user_messages.append(message["content"])
|
|
39
|
+
if len(user_messages) > 0:
|
|
40
|
+
input_argument = user_messages[0]
|
|
41
|
+
|
|
42
|
+
return InputParameters(
|
|
43
|
+
model=model,
|
|
44
|
+
input=stringify_anthropic_content(input_argument),
|
|
45
|
+
messages=render_messages_anthropic(messages),
|
|
46
|
+
tools=tools,
|
|
47
|
+
tool_descriptions=tool_descriptions,
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def safe_extract_output_parameters(
|
|
52
|
+
message_response: Message,
|
|
53
|
+
input_parameters: InputParameters,
|
|
54
|
+
) -> OutputParameters:
|
|
55
|
+
# guarding against errors to be compatible with legacy APIs
|
|
56
|
+
try:
|
|
57
|
+
return extract_messages_api_output_parameters(
|
|
58
|
+
message_response, input_parameters
|
|
59
|
+
)
|
|
60
|
+
except:
|
|
61
|
+
return OutputParameters()
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def extract_messages_api_output_parameters(
|
|
65
|
+
message_response: Message,
|
|
66
|
+
input_parameters: InputParameters,
|
|
67
|
+
) -> OutputParameters:
|
|
68
|
+
output = str(message_response.content[0].text)
|
|
69
|
+
prompt_tokens = message_response.usage.input_tokens
|
|
70
|
+
completion_tokens = message_response.usage.output_tokens
|
|
71
|
+
|
|
72
|
+
tools_called = None
|
|
73
|
+
anthropic_tool_calls = [
|
|
74
|
+
block
|
|
75
|
+
for block in message_response.content
|
|
76
|
+
if isinstance(block, ToolUseBlock)
|
|
77
|
+
]
|
|
78
|
+
if anthropic_tool_calls:
|
|
79
|
+
tools_called = []
|
|
80
|
+
tool_descriptions = input_parameters.tool_descriptions or {}
|
|
81
|
+
for tool_call in anthropic_tool_calls:
|
|
82
|
+
tools_called.append(
|
|
83
|
+
ToolCall(
|
|
84
|
+
name=tool_call.name,
|
|
85
|
+
input_parameters=tool_call.input,
|
|
86
|
+
description=tool_descriptions.get(tool_call.name),
|
|
87
|
+
)
|
|
88
|
+
)
|
|
89
|
+
return OutputParameters(
|
|
90
|
+
output=output,
|
|
91
|
+
prompt_tokens=prompt_tokens,
|
|
92
|
+
completion_tokens=completion_tokens,
|
|
93
|
+
tools_called=tools_called,
|
|
94
|
+
)
|
|
deepeval/anthropic/patch.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
from typing import Callable
|
|
2
|
+
from functools import wraps
|
|
3
|
+
|
|
4
|
+
from deepeval.anthropic.extractors import (
|
|
5
|
+
safe_extract_input_parameters,
|
|
6
|
+
safe_extract_output_parameters,
|
|
7
|
+
InputParameters,
|
|
8
|
+
)
|
|
9
|
+
from deepeval.model_integrations.utils import _update_all_attributes
|
|
10
|
+
from deepeval.tracing import observe
|
|
11
|
+
from deepeval.tracing.trace_context import current_llm_context
|
|
12
|
+
|
|
13
|
+
_ORIGINAL_METHODS = {}
|
|
14
|
+
_ANTHROPIC_PATCHED = False
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def patch_anthropic_classes():
|
|
18
|
+
"""
|
|
19
|
+
Monkey patch Anthropic resource classes directly.
|
|
20
|
+
"""
|
|
21
|
+
global _ANTHROPIC_PATCHED
|
|
22
|
+
|
|
23
|
+
# Single guard - if already patched, return immediately
|
|
24
|
+
if _ANTHROPIC_PATCHED:
|
|
25
|
+
return
|
|
26
|
+
|
|
27
|
+
try:
|
|
28
|
+
from anthropic.resources.messages import Messages, AsyncMessages
|
|
29
|
+
|
|
30
|
+
# Store original methods before patching
|
|
31
|
+
if hasattr(Messages, "create"):
|
|
32
|
+
_ORIGINAL_METHODS["Messages.create"] = Messages.create
|
|
33
|
+
Messages.create = _create_sync_wrapper(Messages.create)
|
|
34
|
+
|
|
35
|
+
if hasattr(AsyncMessages, "create"):
|
|
36
|
+
_ORIGINAL_METHODS["AsyncMessages.create"] = AsyncMessages.create
|
|
37
|
+
AsyncMessages.create = _create_async_wrapper(AsyncMessages.create)
|
|
38
|
+
|
|
39
|
+
except ImportError:
|
|
40
|
+
pass
|
|
41
|
+
|
|
42
|
+
_ANTHROPIC_PATCHED = True
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _create_sync_wrapper(original_method):
|
|
46
|
+
"""
|
|
47
|
+
Create a wrapper for sync methods - called ONCE during patching.
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
@wraps(original_method)
|
|
51
|
+
def method_wrapper(self, *args, **kwargs):
|
|
52
|
+
bound_method = original_method.__get__(self, type(self))
|
|
53
|
+
patched = _patch_sync_anthropic_client_method(
|
|
54
|
+
original_method=bound_method
|
|
55
|
+
)
|
|
56
|
+
return patched(*args, **kwargs)
|
|
57
|
+
|
|
58
|
+
return method_wrapper
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _create_async_wrapper(original_method):
|
|
62
|
+
"""
|
|
63
|
+
Create a wrapper for sync methods - called ONCE during patching.
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
@wraps(original_method)
|
|
67
|
+
def method_wrapper(self, *args, **kwargs):
|
|
68
|
+
bound_method = original_method.__get__(self, type(self))
|
|
69
|
+
patched = _patch_async_anthropic_client_method(
|
|
70
|
+
original_method=bound_method
|
|
71
|
+
)
|
|
72
|
+
return patched(*args, **kwargs)
|
|
73
|
+
|
|
74
|
+
return method_wrapper
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _patch_sync_anthropic_client_method(original_method: Callable):
|
|
78
|
+
@wraps(original_method)
|
|
79
|
+
def patched_sync_anthropic_method(*args, **kwargs):
|
|
80
|
+
input_parameters: InputParameters = safe_extract_input_parameters(
|
|
81
|
+
kwargs
|
|
82
|
+
)
|
|
83
|
+
llm_context = current_llm_context.get()
|
|
84
|
+
|
|
85
|
+
@observe(
|
|
86
|
+
type="llm",
|
|
87
|
+
model=input_parameters.model,
|
|
88
|
+
metrics=llm_context.metrics,
|
|
89
|
+
metric_collection=llm_context.metric_collection,
|
|
90
|
+
)
|
|
91
|
+
def llm_generation(*args, **kwargs):
|
|
92
|
+
messages_api_response = original_method(*args, **kwargs)
|
|
93
|
+
output_parameters = safe_extract_output_parameters(
|
|
94
|
+
messages_api_response, input_parameters
|
|
95
|
+
)
|
|
96
|
+
_update_all_attributes(
|
|
97
|
+
input_parameters,
|
|
98
|
+
output_parameters,
|
|
99
|
+
llm_context.expected_tools,
|
|
100
|
+
llm_context.expected_output,
|
|
101
|
+
llm_context.context,
|
|
102
|
+
llm_context.retrieval_context,
|
|
103
|
+
)
|
|
104
|
+
return messages_api_response
|
|
105
|
+
|
|
106
|
+
return llm_generation(*args, **kwargs)
|
|
107
|
+
|
|
108
|
+
return patched_sync_anthropic_method
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _patch_async_anthropic_client_method(original_method: Callable):
|
|
112
|
+
@wraps(original_method)
|
|
113
|
+
async def patched_async_anthropic_method(*args, **kwargs):
|
|
114
|
+
input_parameters: InputParameters = safe_extract_input_parameters(
|
|
115
|
+
kwargs
|
|
116
|
+
)
|
|
117
|
+
llm_context = current_llm_context.get()
|
|
118
|
+
|
|
119
|
+
@observe(
|
|
120
|
+
type="llm",
|
|
121
|
+
model=input_parameters.model,
|
|
122
|
+
metrics=llm_context.metrics,
|
|
123
|
+
metric_collection=llm_context.metric_collection,
|
|
124
|
+
)
|
|
125
|
+
async def llm_generation(*args, **kwargs):
|
|
126
|
+
messages_api_response = await original_method(*args, **kwargs)
|
|
127
|
+
output_parameters = safe_extract_output_parameters(
|
|
128
|
+
messages_api_response, input_parameters
|
|
129
|
+
)
|
|
130
|
+
_update_all_attributes(
|
|
131
|
+
input_parameters,
|
|
132
|
+
output_parameters,
|
|
133
|
+
llm_context.expected_tools,
|
|
134
|
+
llm_context.expected_output,
|
|
135
|
+
llm_context.context,
|
|
136
|
+
llm_context.retrieval_context,
|
|
137
|
+
)
|
|
138
|
+
return messages_api_response
|
|
139
|
+
|
|
140
|
+
return await llm_generation(*args, **kwargs)
|
|
141
|
+
|
|
142
|
+
return patched_async_anthropic_method
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def unpatch_anthropic_classes():
|
|
146
|
+
"""
|
|
147
|
+
Restore Anthropic resource classes to their original state.
|
|
148
|
+
"""
|
|
149
|
+
global _ANTHROPIC_PATCHED
|
|
150
|
+
|
|
151
|
+
# If not patched, nothing to do
|
|
152
|
+
if not _ANTHROPIC_PATCHED:
|
|
153
|
+
return
|
|
154
|
+
|
|
155
|
+
try:
|
|
156
|
+
from anthropic.resources.messages import Messages, AsyncMessages
|
|
157
|
+
|
|
158
|
+
# Restore original methods for Messages
|
|
159
|
+
if hasattr(Messages, "create"):
|
|
160
|
+
Messages.create = _ORIGINAL_METHODS["Messages.create"]
|
|
161
|
+
|
|
162
|
+
if hasattr(AsyncMessages, "create"):
|
|
163
|
+
AsyncMessages.create = _ORIGINAL_METHODS["AsyncMessages.create"]
|
|
164
|
+
|
|
165
|
+
except ImportError:
|
|
166
|
+
pass
|
|
167
|
+
|
|
168
|
+
# Reset the patched flag
|
|
169
|
+
_ANTHROPIC_PATCHED = False
|
|
deepeval/anthropic/utils.py
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
from typing import Any, Iterable, List
|
|
2
|
+
|
|
3
|
+
from anthropic.types import Message
|
|
4
|
+
|
|
5
|
+
from deepeval.model_integrations.utils import compact_dump, fmt_url
|
|
6
|
+
from deepeval.utils import shorten
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def stringify_anthropic_content(content: Any) -> str:
|
|
10
|
+
"""
|
|
11
|
+
Return a short, human-readable summary string for an Anthropic-style multimodal `content` value.
|
|
12
|
+
|
|
13
|
+
This is used to populate span summaries, such as `InputParameters.input`. It never raises and
|
|
14
|
+
never returns huge blobs.
|
|
15
|
+
|
|
16
|
+
Notes:
|
|
17
|
+
- Data URIs and base64 content are redacted to "[data-uri]" or "[base64:...]".
|
|
18
|
+
- Output is capped via `deepeval.utils.shorten` (configurable through settings).
|
|
19
|
+
- Fields that are not explicitly handled are returned as size-capped JSON dumps
|
|
20
|
+
- This string is for display/summary only, not intended to be parsable.
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
content: The value of an Anthropic message `content`, may be a str or list of content blocks,
|
|
24
|
+
or any nested structure.
|
|
25
|
+
|
|
26
|
+
Returns:
|
|
27
|
+
A short, readable `str` summary.
|
|
28
|
+
"""
|
|
29
|
+
if content is None:
|
|
30
|
+
return ""
|
|
31
|
+
if isinstance(content, str):
|
|
32
|
+
return content
|
|
33
|
+
if isinstance(content, (bytes, bytearray)):
|
|
34
|
+
return f"[bytes:{len(content)}]"
|
|
35
|
+
|
|
36
|
+
# list of content blocks for Anthropic Messages API
|
|
37
|
+
if isinstance(content, list):
|
|
38
|
+
parts: List[str] = []
|
|
39
|
+
for block in content:
|
|
40
|
+
s = stringify_anthropic_content(block)
|
|
41
|
+
if s:
|
|
42
|
+
parts.append(s)
|
|
43
|
+
return "\n".join(parts)
|
|
44
|
+
|
|
45
|
+
# dict shapes for Anthropic Messages API
|
|
46
|
+
if isinstance(content, dict):
|
|
47
|
+
t = content.get("type")
|
|
48
|
+
|
|
49
|
+
# Text block
|
|
50
|
+
if t == "text":
|
|
51
|
+
return str(content.get("text", ""))
|
|
52
|
+
|
|
53
|
+
# Image block
|
|
54
|
+
if t == "image":
|
|
55
|
+
source = content.get("source", {})
|
|
56
|
+
source_type = source.get("type")
|
|
57
|
+
if source_type == "base64":
|
|
58
|
+
media_type = source.get("media_type", "unknown")
|
|
59
|
+
data = source.get("data", "")
|
|
60
|
+
data_preview = data[:20] if data else ""
|
|
61
|
+
return f"[image:{media_type}:base64:{data_preview}...]"
|
|
62
|
+
elif source_type == "url":
|
|
63
|
+
url = source.get("url", "")
|
|
64
|
+
return f"[image:{fmt_url(url)}]"
|
|
65
|
+
else:
|
|
66
|
+
return f"[image:{source_type or 'unknown'}]"
|
|
67
|
+
|
|
68
|
+
# Tool use block (in assistant messages)
|
|
69
|
+
if t == "tool_use":
|
|
70
|
+
tool_name = content.get("name", "unknown")
|
|
71
|
+
tool_id = content.get("id", "")
|
|
72
|
+
tool_input = content.get("input", {})
|
|
73
|
+
input_str = compact_dump(tool_input) if tool_input else ""
|
|
74
|
+
return f"[tool_use:{tool_name}:{tool_id}:{input_str}]"
|
|
75
|
+
|
|
76
|
+
# Tool result block (in user messages)
|
|
77
|
+
if t == "tool_result":
|
|
78
|
+
tool_id = content.get("tool_use_id", "")
|
|
79
|
+
tool_content = content.get("content")
|
|
80
|
+
content_str = (
|
|
81
|
+
stringify_anthropic_content(tool_content)
|
|
82
|
+
if tool_content
|
|
83
|
+
else ""
|
|
84
|
+
)
|
|
85
|
+
is_error = content.get("is_error", False)
|
|
86
|
+
error_flag = ":error" if is_error else ""
|
|
87
|
+
return f"[tool_result:{tool_id}{error_flag}:{content_str}]"
|
|
88
|
+
|
|
89
|
+
# Document block (for PDFs and other documents)
|
|
90
|
+
if t == "document":
|
|
91
|
+
source = content.get("source", {})
|
|
92
|
+
source_type = source.get("type")
|
|
93
|
+
if source_type == "base64":
|
|
94
|
+
media_type = source.get("media_type", "unknown")
|
|
95
|
+
return f"[document:{media_type}:base64]"
|
|
96
|
+
elif source_type == "url":
|
|
97
|
+
url = source.get("url", "")
|
|
98
|
+
return f"[document:{fmt_url(url)}]"
|
|
99
|
+
else:
|
|
100
|
+
return f"[document:{source_type or 'unknown'}]"
|
|
101
|
+
|
|
102
|
+
# Thinking block (for extended thinking models)
|
|
103
|
+
if t == "thinking":
|
|
104
|
+
thinking_text = content.get("thinking", "")
|
|
105
|
+
return {
|
|
106
|
+
"role": "thinking",
|
|
107
|
+
"content": shorten(thinking_text, max_len=100),
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
# readability for other block types we don't currently handle
|
|
111
|
+
if t:
|
|
112
|
+
return f"[{t}]"
|
|
113
|
+
|
|
114
|
+
# unknown dicts and types returned as shortened JSON
|
|
115
|
+
return compact_dump(content)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def render_messages_anthropic(
|
|
119
|
+
messages: Iterable[Message],
|
|
120
|
+
):
|
|
121
|
+
"""
|
|
122
|
+
Extracts and normalizes tool calls and tool results from Anthropic API messages
|
|
123
|
+
for observability/logging purposes.
|
|
124
|
+
|
|
125
|
+
Args:
|
|
126
|
+
messages: Iterable of message dictionaries in Anthropic API format
|
|
127
|
+
|
|
128
|
+
Returns:
|
|
129
|
+
List of normalized message objects suitable for logging/observability
|
|
130
|
+
"""
|
|
131
|
+
messages_list = []
|
|
132
|
+
|
|
133
|
+
for message in messages:
|
|
134
|
+
role = message.get("role")
|
|
135
|
+
content = message.get("content")
|
|
136
|
+
|
|
137
|
+
if role == "assistant":
|
|
138
|
+
if isinstance(content, str):
|
|
139
|
+
messages_list.append(
|
|
140
|
+
{
|
|
141
|
+
"role": role,
|
|
142
|
+
"content": content,
|
|
143
|
+
}
|
|
144
|
+
)
|
|
145
|
+
elif isinstance(content, list):
|
|
146
|
+
for block in content:
|
|
147
|
+
block_type = block.get("type")
|
|
148
|
+
if block_type == "text":
|
|
149
|
+
messages_list.append(
|
|
150
|
+
{
|
|
151
|
+
"role": role,
|
|
152
|
+
"content": block.get("text", ""),
|
|
153
|
+
}
|
|
154
|
+
)
|
|
155
|
+
elif block_type == "tool_use":
|
|
156
|
+
messages_list.append(
|
|
157
|
+
{
|
|
158
|
+
"id": block.get("id", ""),
|
|
159
|
+
"call_id": block.get("id", ""),
|
|
160
|
+
"name": block.get("name", ""),
|
|
161
|
+
"type": "function",
|
|
162
|
+
"arguments": block.get("input", {}),
|
|
163
|
+
}
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
elif role == "user":
|
|
167
|
+
if isinstance(content, str):
|
|
168
|
+
messages_list.append(
|
|
169
|
+
{
|
|
170
|
+
"role": role,
|
|
171
|
+
"content": content,
|
|
172
|
+
}
|
|
173
|
+
)
|
|
174
|
+
elif isinstance(content, list):
|
|
175
|
+
for block in content:
|
|
176
|
+
block_type = block.get("type")
|
|
177
|
+
if block_type == "text":
|
|
178
|
+
messages_list.append(
|
|
179
|
+
{
|
|
180
|
+
"role": role,
|
|
181
|
+
"content": block.get("text", ""),
|
|
182
|
+
}
|
|
183
|
+
)
|
|
184
|
+
elif block_type == "image":
|
|
185
|
+
messages_list.append(
|
|
186
|
+
{
|
|
187
|
+
"role": role,
|
|
188
|
+
"content": "[Image content]",
|
|
189
|
+
"image_source": block.get("source", {}),
|
|
190
|
+
}
|
|
191
|
+
)
|
|
192
|
+
elif block_type == "tool_result":
|
|
193
|
+
tool_content = block.get("content", "")
|
|
194
|
+
if isinstance(tool_content, list):
|
|
195
|
+
output_parts = []
|
|
196
|
+
for tool_content_block in tool_content:
|
|
197
|
+
if isinstance(tool_content_block, dict):
|
|
198
|
+
if tool_content_block.get("type") == "text":
|
|
199
|
+
output_parts.append(
|
|
200
|
+
tool_content_block.get("text", "")
|
|
201
|
+
)
|
|
202
|
+
else:
|
|
203
|
+
output_parts.append(str(tool_content_block))
|
|
204
|
+
output = "\n".join(output_parts)
|
|
205
|
+
else:
|
|
206
|
+
output = tool_content
|
|
207
|
+
|
|
208
|
+
messages_list.append(
|
|
209
|
+
{
|
|
210
|
+
"call_id": block.get("tool_use_id", ""),
|
|
211
|
+
"type": "tool",
|
|
212
|
+
"output": output,
|
|
213
|
+
"is_error": block.get("is_error", False),
|
|
214
|
+
}
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
elif role == "system":
|
|
218
|
+
messages_list.append(
|
|
219
|
+
{
|
|
220
|
+
"role": role,
|
|
221
|
+
"content": content,
|
|
222
|
+
}
|
|
223
|
+
)
|
|
224
|
+
|
|
225
|
+
return messages_list
|
deepeval/benchmarks/drop/drop.py
CHANGED
|
@@ -1,6 +1,10 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
1
3
|
from typing import List, Optional, Dict, Union
|
|
2
4
|
from tqdm import tqdm
|
|
3
5
|
|
|
6
|
+
from deepeval.config.settings import get_settings
|
|
7
|
+
from deepeval.errors import DeepEvalError
|
|
4
8
|
from deepeval.dataset import Golden
|
|
5
9
|
from deepeval.benchmarks.base_benchmark import (
|
|
6
10
|
DeepEvalBaseBenchmark,
|
|
@@ -17,6 +21,8 @@ from deepeval.benchmarks.schema import (
|
|
|
17
21
|
)
|
|
18
22
|
from deepeval.telemetry import capture_benchmark_run
|
|
19
23
|
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
20
26
|
DELIMITER = ","
|
|
21
27
|
|
|
22
28
|
|
|
@@ -164,7 +170,7 @@ class DROP(DeepEvalBaseBenchmark):
|
|
|
164
170
|
def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
|
|
165
171
|
# Define prompt template
|
|
166
172
|
assert (
|
|
167
|
-
self.shots_dataset
|
|
173
|
+
self.shots_dataset is not None
|
|
168
174
|
), "Example dataset is empty. Call load_benchmark."
|
|
169
175
|
prompt: dict = DROPTemplate.generate_output(
|
|
170
176
|
train_set=self.shots_dataset,
|
|
@@ -206,7 +212,7 @@ class DROP(DeepEvalBaseBenchmark):
|
|
|
206
212
|
) -> List[Dict]:
|
|
207
213
|
# Define prompt template
|
|
208
214
|
assert (
|
|
209
|
-
self.shots_dataset
|
|
215
|
+
self.shots_dataset is not None
|
|
210
216
|
), "Example dataset is empty. Call load_benchmark."
|
|
211
217
|
|
|
212
218
|
prompts = []
|
|
@@ -215,7 +221,6 @@ class DROP(DeepEvalBaseBenchmark):
|
|
|
215
221
|
prompt: dict = DROPTemplate.generate_output(
|
|
216
222
|
train_set=self.shots_dataset,
|
|
217
223
|
input=golden.input,
|
|
218
|
-
type=golden.context[0],
|
|
219
224
|
n_shots=self.n_shots,
|
|
220
225
|
)
|
|
221
226
|
prompts.append(prompt)
|
|
@@ -228,23 +233,44 @@ class DROP(DeepEvalBaseBenchmark):
|
|
|
228
233
|
schema = DROPStringSchema
|
|
229
234
|
schemas.append(schema)
|
|
230
235
|
|
|
236
|
+
effective_batch_size = len(goldens)
|
|
237
|
+
model_name = getattr(
|
|
238
|
+
model, "get_model_name", lambda: type(model).__name__
|
|
239
|
+
)()
|
|
240
|
+
|
|
231
241
|
try:
|
|
232
242
|
responses: List[
|
|
233
243
|
Union[DROPNumberSchema, DROPDateSchema, DROPStringSchema]
|
|
234
244
|
] = model.batch_generate(prompts=prompts, schemas=schemas)
|
|
235
245
|
predictions = [str(res.answer) for res in responses]
|
|
236
|
-
except
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
246
|
+
except (AttributeError, NotImplementedError) as e:
|
|
247
|
+
logger.error(
|
|
248
|
+
"DROP: model %s does not implement batch_generate. Batch evaluation "
|
|
249
|
+
"(effective batch_size=%s) requires a batch-capable model. "
|
|
250
|
+
"Use a model that implements batch_generate(prompts, schemas) or run with batch_size=0/None.",
|
|
251
|
+
model_name,
|
|
252
|
+
effective_batch_size,
|
|
253
|
+
exc_info=get_settings().DEEPEVAL_LOG_STACK_TRACES,
|
|
254
|
+
)
|
|
255
|
+
raise DeepEvalError(
|
|
256
|
+
"Model does not implement batch_generate. Use a batch-capable model or set batch_size=0/None."
|
|
257
|
+
) from e
|
|
258
|
+
|
|
259
|
+
except TypeError as e:
|
|
260
|
+
logger.error(
|
|
261
|
+
"DROP: model %s does not support schema-aware batch generation "
|
|
262
|
+
"(batch_generate(prompts, schemas)). DROP requires structured outputs "
|
|
263
|
+
"for number/date/span. Use a model that supports schemas or run with batch_size=0/None.",
|
|
264
|
+
model_name,
|
|
265
|
+
exc_info=get_settings().DEEPEVAL_LOG_STACK_TRACES,
|
|
266
|
+
)
|
|
267
|
+
raise DeepEvalError(
|
|
268
|
+
"Model does not support schema-aware batch generation required by DROP. "
|
|
269
|
+
"Use batch_generate(prompts, schemas) or set batch_size=0/None."
|
|
270
|
+
) from e
|
|
245
271
|
|
|
246
|
-
if len(predictions) is not len(goldens):
|
|
247
|
-
raise ValueError(
|
|
272
|
+
if len(predictions) != effective_batch_size:
|
|
273
|
+
raise DeepEvalError(
|
|
248
274
|
"Custom `batch_generate` method did not return the same number of generations as the number of prompts."
|
|
249
275
|
)
|
|
250
276
|
|
|
deepeval/benchmarks/ifeval/ifeval.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
from pydantic.config import ConfigDict
|
|
2
1
|
from deepeval.benchmarks.base_benchmark import (
|
|
3
2
|
DeepEvalBaseBenchmark,
|
|
4
3
|
DeepEvalBaseBenchmarkResult,
|
|
5
4
|
)
|
|
5
|
+
from deepeval.utils import make_model_config
|
|
6
6
|
from typing import List, Optional, Dict, Any, Tuple
|
|
7
7
|
from tqdm import tqdm
|
|
8
8
|
import re
|
|
@@ -19,7 +19,7 @@ from deepeval.telemetry import capture_benchmark_run
|
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class IFEvalResult(DeepEvalBaseBenchmarkResult):
|
|
22
|
-
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
22
|
+
model_config = make_model_config(arbitrary_types_allowed=True)
|
|
23
23
|
instruction_breakdown: dict[str, Any]
|
|
24
24
|
predictions: "pd.DataFrame"
|
|
25
25
|
|
deepeval/confident/types.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
|
-
from pydantic import BaseModel, ConfigDict
|
|
1
|
+
from pydantic import BaseModel
|
|
2
2
|
from typing import Any, Optional
|
|
3
3
|
|
|
4
|
+
from deepeval.utils import make_model_config
|
|
5
|
+
|
|
4
6
|
|
|
5
7
|
class ApiResponse(BaseModel):
|
|
6
|
-
model_config = ConfigDict(extra="ignore")
|
|
8
|
+
model_config = make_model_config(extra="ignore")
|
|
7
9
|
|
|
8
10
|
success: bool
|
|
9
11
|
data: Optional[Any] = None
|