deepeval 3.6.8__py3-none-any.whl → 3.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. deepeval/_version.py +1 -1
  2. deepeval/anthropic/__init__.py +19 -0
  3. deepeval/anthropic/extractors.py +94 -0
  4. deepeval/anthropic/patch.py +169 -0
  5. deepeval/anthropic/utils.py +225 -0
  6. deepeval/benchmarks/drop/drop.py +40 -14
  7. deepeval/benchmarks/ifeval/ifeval.py +2 -2
  8. deepeval/confident/types.py +4 -2
  9. deepeval/config/settings.py +258 -47
  10. deepeval/config/settings_manager.py +4 -0
  11. deepeval/config/utils.py +5 -0
  12. deepeval/dataset/dataset.py +162 -30
  13. deepeval/dataset/utils.py +41 -13
  14. deepeval/evaluate/execute.py +1099 -633
  15. deepeval/integrations/crewai/handler.py +36 -0
  16. deepeval/integrations/langchain/callback.py +27 -2
  17. deepeval/integrations/llama_index/handler.py +58 -4
  18. deepeval/integrations/llama_index/utils.py +24 -0
  19. deepeval/metrics/__init__.py +5 -0
  20. deepeval/metrics/exact_match/__init__.py +0 -0
  21. deepeval/metrics/exact_match/exact_match.py +94 -0
  22. deepeval/metrics/indicator.py +21 -1
  23. deepeval/metrics/pattern_match/__init__.py +0 -0
  24. deepeval/metrics/pattern_match/pattern_match.py +103 -0
  25. deepeval/metrics/task_completion/task_completion.py +9 -2
  26. deepeval/model_integrations/__init__.py +0 -0
  27. deepeval/model_integrations/utils.py +116 -0
  28. deepeval/models/base_model.py +3 -1
  29. deepeval/models/llms/amazon_bedrock_model.py +20 -17
  30. deepeval/models/llms/openai_model.py +10 -1
  31. deepeval/models/retry_policy.py +103 -20
  32. deepeval/openai/__init__.py +3 -1
  33. deepeval/openai/extractors.py +2 -2
  34. deepeval/openai/utils.py +7 -31
  35. deepeval/prompt/api.py +11 -10
  36. deepeval/prompt/prompt.py +5 -4
  37. deepeval/simulator/conversation_simulator.py +25 -18
  38. deepeval/synthesizer/chunking/context_generator.py +9 -1
  39. deepeval/telemetry.py +3 -3
  40. deepeval/test_case/llm_test_case.py +3 -2
  41. deepeval/test_run/api.py +3 -2
  42. deepeval/test_run/cache.py +4 -3
  43. deepeval/test_run/test_run.py +24 -5
  44. deepeval/tracing/api.py +11 -10
  45. deepeval/tracing/otel/exporter.py +11 -0
  46. deepeval/tracing/patchers.py +102 -1
  47. deepeval/tracing/trace_context.py +13 -4
  48. deepeval/tracing/tracing.py +10 -1
  49. deepeval/tracing/types.py +8 -8
  50. deepeval/tracing/utils.py +9 -0
  51. deepeval/utils.py +44 -2
  52. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/METADATA +2 -2
  53. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/RECORD +57 -47
  54. /deepeval/{openai → model_integrations}/types.py +0 -0
  55. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/LICENSE.md +0 -0
  56. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/WHEEL +0 -0
  57. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/entry_points.txt +0 -0
deepeval/_version.py CHANGED
@@ -1 +1 @@
- __version__: str = "3.6.8"
+ __version__: str = "3.7.0"
deepeval/anthropic/__init__.py ADDED
@@ -0,0 +1,19 @@
+ try:
+     import anthropic  # noqa: F401
+ except ImportError:
+     raise ModuleNotFoundError(
+         "Please install anthropic to use this feature: 'pip install anthropic'"
+     )
+
+ try:
+     from anthropic import Anthropic, AsyncAnthropic  # noqa: F401
+ except ImportError:
+     Anthropic = None  # type: ignore
+     AsyncAnthropic = None  # type: ignore
+
+ if Anthropic or AsyncAnthropic:
+     from deepeval.anthropic.patch import patch_anthropic_classes
+     from deepeval.telemetry import capture_tracing_integration
+
+     with capture_tracing_integration("anthropic"):
+         patch_anthropic_classes()
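Usage note: importing deepeval.anthropic patches the Anthropic client classes as a side effect, so existing Anthropic calls are traced without code changes. A minimal sketch of the intended flow (assuming the anthropic package is installed and ANTHROPIC_API_KEY is set; the model name is illustrative):

    # Importing deepeval.anthropic runs patch_anthropic_classes() on import,
    # wrapping Messages.create so each call is recorded as an "llm" span.
    import deepeval.anthropic  # noqa: F401  (side-effect import)
    from anthropic import Anthropic

    client = Anthropic()  # reads ANTHROPIC_API_KEY from the environment
    response = client.messages.create(
        model="claude-3-5-sonnet-latest",  # illustrative model name
        max_tokens=128,
        messages=[{"role": "user", "content": "Say hello."}],
    )
    print(response.content[0].text)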
deepeval/anthropic/extractors.py ADDED
@@ -0,0 +1,94 @@
+ from anthropic.types.message import Message
+ from anthropic.types import ToolUseBlock
+ from typing import Any, Dict
+
+ from deepeval.anthropic.utils import (
+     render_messages_anthropic,
+     stringify_anthropic_content,
+ )
+ from deepeval.model_integrations.types import InputParameters, OutputParameters
+ from deepeval.test_case.llm_test_case import ToolCall
+
+
+ def safe_extract_input_parameters(kwargs: Dict[str, Any]) -> InputParameters:
+     # guarding against errors to be compatible with legacy APIs
+     try:
+         return extract_messages_api_input_parameters(kwargs)
+     except Exception:
+         return InputParameters(model="NA")
+
+
+ def extract_messages_api_input_parameters(
+     kwargs: Dict[str, Any],
+ ) -> InputParameters:
+     model = kwargs.get("model")
+     tools = kwargs.get("tools")
+     messages = kwargs.get("messages")
+     tool_descriptions = (
+         {tool["name"]: tool["description"] for tool in tools}
+         if tools is not None
+         else None
+     )
+
+     input_argument = ""
+     user_messages = []
+     for message in messages:
+         role = message["role"]
+         if role == "user":
+             user_messages.append(message["content"])
+     if len(user_messages) > 0:
+         input_argument = user_messages[0]
+
+     return InputParameters(
+         model=model,
+         input=stringify_anthropic_content(input_argument),
+         messages=render_messages_anthropic(messages),
+         tools=tools,
+         tool_descriptions=tool_descriptions,
+     )
+
+
+ def safe_extract_output_parameters(
+     message_response: Message,
+     input_parameters: InputParameters,
+ ) -> OutputParameters:
+     # guarding against errors to be compatible with legacy APIs
+     try:
+         return extract_messages_api_output_parameters(
+             message_response, input_parameters
+         )
+     except Exception:
+         return OutputParameters()
+
+
+ def extract_messages_api_output_parameters(
+     message_response: Message,
+     input_parameters: InputParameters,
+ ) -> OutputParameters:
+     output = str(message_response.content[0].text)
+     prompt_tokens = message_response.usage.input_tokens
+     completion_tokens = message_response.usage.output_tokens
+
+     tools_called = None
+     anthropic_tool_calls = [
+         block
+         for block in message_response.content
+         if isinstance(block, ToolUseBlock)
+     ]
+     if anthropic_tool_calls:
+         tools_called = []
+         tool_descriptions = input_parameters.tool_descriptions or {}
+         for tool_call in anthropic_tool_calls:
+             tools_called.append(
+                 ToolCall(
+                     name=tool_call.name,
+                     input_parameters=tool_call.input,
+                     description=tool_descriptions.get(tool_call.name),
+                 )
+             )
+     return OutputParameters(
+         output=output,
+         prompt_tokens=prompt_tokens,
+         completion_tokens=completion_tokens,
+         tools_called=tools_called,
+     )
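For reference, the extractor contract can be exercised directly. A minimal sketch (the kwargs mirror what a caller would pass to Messages.create; the model name is illustrative, and the expected values follow from the code above):

    from deepeval.anthropic.extractors import safe_extract_input_parameters

    params = safe_extract_input_parameters(
        {
            "model": "claude-3-5-sonnet-latest",  # illustrative
            "messages": [{"role": "user", "content": "What is 2 + 2?"}],
        }
    )
    print(params.model)  # -> claude-3-5-sonnet-latest
    print(params.input)  # -> What is 2 + 2?  (first user message, stringified)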
deepeval/anthropic/patch.py ADDED
@@ -0,0 +1,169 @@
+ from typing import Callable
+ from functools import wraps
+
+ from deepeval.anthropic.extractors import (
+     safe_extract_input_parameters,
+     safe_extract_output_parameters,
+     InputParameters,
+ )
+ from deepeval.model_integrations.utils import _update_all_attributes
+ from deepeval.tracing import observe
+ from deepeval.tracing.trace_context import current_llm_context
+
+ _ORIGINAL_METHODS = {}
+ _ANTHROPIC_PATCHED = False
+
+
+ def patch_anthropic_classes():
+     """
+     Monkey patch Anthropic resource classes directly.
+     """
+     global _ANTHROPIC_PATCHED
+
+     # Single guard - if already patched, return immediately
+     if _ANTHROPIC_PATCHED:
+         return
+
+     try:
+         from anthropic.resources.messages import Messages, AsyncMessages
+
+         # Store original methods before patching
+         if hasattr(Messages, "create"):
+             _ORIGINAL_METHODS["Messages.create"] = Messages.create
+             Messages.create = _create_sync_wrapper(Messages.create)
+
+         if hasattr(AsyncMessages, "create"):
+             _ORIGINAL_METHODS["AsyncMessages.create"] = AsyncMessages.create
+             AsyncMessages.create = _create_async_wrapper(AsyncMessages.create)
+
+     except ImportError:
+         pass
+
+     _ANTHROPIC_PATCHED = True
+
+
+ def _create_sync_wrapper(original_method):
+     """
+     Create a wrapper for sync methods - called ONCE during patching.
+     """
+
+     @wraps(original_method)
+     def method_wrapper(self, *args, **kwargs):
+         bound_method = original_method.__get__(self, type(self))
+         patched = _patch_sync_anthropic_client_method(
+             original_method=bound_method
+         )
+         return patched(*args, **kwargs)
+
+     return method_wrapper
+
+
+ def _create_async_wrapper(original_method):
+     """
+     Create a wrapper for async methods - called ONCE during patching.
+     """
+
+     @wraps(original_method)
+     def method_wrapper(self, *args, **kwargs):
+         bound_method = original_method.__get__(self, type(self))
+         patched = _patch_async_anthropic_client_method(
+             original_method=bound_method
+         )
+         return patched(*args, **kwargs)
+
+     return method_wrapper
+
+
+ def _patch_sync_anthropic_client_method(original_method: Callable):
+     @wraps(original_method)
+     def patched_sync_anthropic_method(*args, **kwargs):
+         input_parameters: InputParameters = safe_extract_input_parameters(
+             kwargs
+         )
+         llm_context = current_llm_context.get()
+
+         @observe(
+             type="llm",
+             model=input_parameters.model,
+             metrics=llm_context.metrics,
+             metric_collection=llm_context.metric_collection,
+         )
+         def llm_generation(*args, **kwargs):
+             messages_api_response = original_method(*args, **kwargs)
+             output_parameters = safe_extract_output_parameters(
+                 messages_api_response, input_parameters
+             )
+             _update_all_attributes(
+                 input_parameters,
+                 output_parameters,
+                 llm_context.expected_tools,
+                 llm_context.expected_output,
+                 llm_context.context,
+                 llm_context.retrieval_context,
+             )
+             return messages_api_response
+
+         return llm_generation(*args, **kwargs)
+
+     return patched_sync_anthropic_method
+
+
+ def _patch_async_anthropic_client_method(original_method: Callable):
+     @wraps(original_method)
+     async def patched_async_anthropic_method(*args, **kwargs):
+         input_parameters: InputParameters = safe_extract_input_parameters(
+             kwargs
+         )
+         llm_context = current_llm_context.get()
+
+         @observe(
+             type="llm",
+             model=input_parameters.model,
+             metrics=llm_context.metrics,
+             metric_collection=llm_context.metric_collection,
+         )
+         async def llm_generation(*args, **kwargs):
+             messages_api_response = await original_method(*args, **kwargs)
+             output_parameters = safe_extract_output_parameters(
+                 messages_api_response, input_parameters
+             )
+             _update_all_attributes(
+                 input_parameters,
+                 output_parameters,
+                 llm_context.expected_tools,
+                 llm_context.expected_output,
+                 llm_context.context,
+                 llm_context.retrieval_context,
+             )
+             return messages_api_response
+
+         return await llm_generation(*args, **kwargs)
+
+     return patched_async_anthropic_method
+
+
+ def unpatch_anthropic_classes():
+     """
+     Restore Anthropic resource classes to their original state.
+     """
+     global _ANTHROPIC_PATCHED
+
+     # If not patched, nothing to do
+     if not _ANTHROPIC_PATCHED:
+         return
+
+     try:
+         from anthropic.resources.messages import Messages, AsyncMessages
+
+         # Restore original methods for Messages
+         if hasattr(Messages, "create"):
+             Messages.create = _ORIGINAL_METHODS["Messages.create"]
+
+         if hasattr(AsyncMessages, "create"):
+             AsyncMessages.create = _ORIGINAL_METHODS["AsyncMessages.create"]
+
+     except ImportError:
+         pass
+
+     # Reset the patched flag
+     _ANTHROPIC_PATCHED = False
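The patch/unpatch pair is idempotent thanks to the `_ANTHROPIC_PATCHED` guard and the `_ORIGINAL_METHODS` store. A minimal sketch of the round trip, using only names defined in the file above:

    from deepeval.anthropic.patch import (
        patch_anthropic_classes,
        unpatch_anthropic_classes,
    )

    patch_anthropic_classes()    # wraps Messages.create / AsyncMessages.create
    patch_anthropic_classes()    # no-op: the guard returns immediately
    unpatch_anthropic_classes()  # restores the stored originals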
deepeval/anthropic/utils.py ADDED
@@ -0,0 +1,225 @@
+ from typing import Any, Iterable, List
+
+ from anthropic.types import Message
+
+ from deepeval.model_integrations.utils import compact_dump, fmt_url
+ from deepeval.utils import shorten
+
+
+ def stringify_anthropic_content(content: Any) -> str:
+     """
+     Return a short, human-readable summary string for an Anthropic-style multimodal `content` value.
+
+     This is used to populate span summaries, such as `InputParameters.input`. It never raises and
+     never returns huge blobs.
+
+     Notes:
+     - Data URIs and base64 content are redacted to "[data-uri]" or "[base64:...]".
+     - Output is capped via `deepeval.utils.shorten` (configurable through settings).
+     - Fields that are not explicitly handled are returned as size-capped JSON dumps.
+     - This string is for display/summary only, not intended to be parsable.
+
+     Args:
+         content: The value of an Anthropic message `content`; may be a str, a list of content
+             blocks, or any nested structure.
+
+     Returns:
+         A short, readable `str` summary.
+     """
+     if content is None:
+         return ""
+     if isinstance(content, str):
+         return content
+     if isinstance(content, (bytes, bytearray)):
+         return f"[bytes:{len(content)}]"
+
+     # list of content blocks for Anthropic Messages API
+     if isinstance(content, list):
+         parts: List[str] = []
+         for block in content:
+             s = stringify_anthropic_content(block)
+             if s:
+                 parts.append(s)
+         return "\n".join(parts)
+
+     # dict shapes for Anthropic Messages API
+     if isinstance(content, dict):
+         t = content.get("type")
+
+         # Text block
+         if t == "text":
+             return str(content.get("text", ""))
+
+         # Image block
+         if t == "image":
+             source = content.get("source", {})
+             source_type = source.get("type")
+             if source_type == "base64":
+                 media_type = source.get("media_type", "unknown")
+                 data = source.get("data", "")
+                 data_preview = data[:20] if data else ""
+                 return f"[image:{media_type}:base64:{data_preview}...]"
+             elif source_type == "url":
+                 url = source.get("url", "")
+                 return f"[image:{fmt_url(url)}]"
+             else:
+                 return f"[image:{source_type or 'unknown'}]"
+
+         # Tool use block (in assistant messages)
+         if t == "tool_use":
+             tool_name = content.get("name", "unknown")
+             tool_id = content.get("id", "")
+             tool_input = content.get("input", {})
+             input_str = compact_dump(tool_input) if tool_input else ""
+             return f"[tool_use:{tool_name}:{tool_id}:{input_str}]"
+
+         # Tool result block (in user messages)
+         if t == "tool_result":
+             tool_id = content.get("tool_use_id", "")
+             tool_content = content.get("content")
+             content_str = (
+                 stringify_anthropic_content(tool_content)
+                 if tool_content
+                 else ""
+             )
+             is_error = content.get("is_error", False)
+             error_flag = ":error" if is_error else ""
+             return f"[tool_result:{tool_id}{error_flag}:{content_str}]"
+
+         # Document block (for PDFs and other documents)
+         if t == "document":
+             source = content.get("source", {})
+             source_type = source.get("type")
+             if source_type == "base64":
+                 media_type = source.get("media_type", "unknown")
+                 return f"[document:{media_type}:base64]"
+             elif source_type == "url":
+                 url = source.get("url", "")
+                 return f"[document:{fmt_url(url)}]"
+             else:
+                 return f"[document:{source_type or 'unknown'}]"
+
+         # Thinking block (for extended thinking models)
+         if t == "thinking":
+             thinking_text = content.get("thinking", "")
+             return f"[thinking:{shorten(thinking_text, max_len=100)}]"
+
+         # readability for other block types we don't currently handle
+         if t:
+             return f"[{t}]"
+
+     # unknown dicts and types returned as shortened JSON
+     return compact_dump(content)
+
+
+ def render_messages_anthropic(
+     messages: Iterable[Message],
+ ):
+     """
+     Extracts and normalizes tool calls and tool results from Anthropic API messages
+     for observability/logging purposes.
+
+     Args:
+         messages: Iterable of message dictionaries in Anthropic API format
+
+     Returns:
+         List of normalized message objects suitable for logging/observability
+     """
+     messages_list = []
+
+     for message in messages:
+         role = message.get("role")
+         content = message.get("content")
+
+         if role == "assistant":
+             if isinstance(content, str):
+                 messages_list.append(
+                     {
+                         "role": role,
+                         "content": content,
+                     }
+                 )
+             elif isinstance(content, list):
+                 for block in content:
+                     block_type = block.get("type")
+                     if block_type == "text":
+                         messages_list.append(
+                             {
+                                 "role": role,
+                                 "content": block.get("text", ""),
+                             }
+                         )
+                     elif block_type == "tool_use":
+                         messages_list.append(
+                             {
+                                 "id": block.get("id", ""),
+                                 "call_id": block.get("id", ""),
+                                 "name": block.get("name", ""),
+                                 "type": "function",
+                                 "arguments": block.get("input", {}),
+                             }
+                         )
+
+         elif role == "user":
+             if isinstance(content, str):
+                 messages_list.append(
+                     {
+                         "role": role,
+                         "content": content,
+                     }
+                 )
+             elif isinstance(content, list):
+                 for block in content:
+                     block_type = block.get("type")
+                     if block_type == "text":
+                         messages_list.append(
+                             {
+                                 "role": role,
+                                 "content": block.get("text", ""),
+                             }
+                         )
+                     elif block_type == "image":
+                         messages_list.append(
+                             {
+                                 "role": role,
+                                 "content": "[Image content]",
+                                 "image_source": block.get("source", {}),
+                             }
+                         )
+                     elif block_type == "tool_result":
+                         tool_content = block.get("content", "")
+                         if isinstance(tool_content, list):
+                             output_parts = []
+                             for tool_content_block in tool_content:
+                                 if isinstance(tool_content_block, dict):
+                                     if tool_content_block.get("type") == "text":
+                                         output_parts.append(
+                                             tool_content_block.get("text", "")
+                                         )
+                                 else:
+                                     output_parts.append(str(tool_content_block))
+                             output = "\n".join(output_parts)
+                         else:
+                             output = tool_content
+
+                         messages_list.append(
+                             {
+                                 "call_id": block.get("tool_use_id", ""),
+                                 "type": "tool",
+                                 "output": output,
+                                 "is_error": block.get("is_error", False),
+                             }
+                         )
+
+         elif role == "system":
+             messages_list.append(
+                 {
+                     "role": role,
+                     "content": content,
+                 }
+             )
+
+     return messages_list
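For a sense of the summaries stringify_anthropic_content produces, a small self-contained sketch (the block shapes follow the Anthropic Messages API; the URL is illustrative):

    from deepeval.anthropic.utils import stringify_anthropic_content

    blocks = [
        {"type": "text", "text": "Here is the chart."},
        {
            "type": "image",
            "source": {"type": "url", "url": "https://example.com/chart.png"},
        },
    ]
    # One line per block: the text verbatim, then an "[image:...]" placeholder
    # (the URL is passed through fmt_url, which may shorten it).
    print(stringify_anthropic_content(blocks))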
deepeval/benchmarks/drop/drop.py CHANGED
@@ -1,6 +1,10 @@
+ import logging
+
  from typing import List, Optional, Dict, Union
  from tqdm import tqdm

+ from deepeval.config.settings import get_settings
+ from deepeval.errors import DeepEvalError
  from deepeval.dataset import Golden
  from deepeval.benchmarks.base_benchmark import (
      DeepEvalBaseBenchmark,
@@ -17,6 +21,8 @@ from deepeval.benchmarks.schema import (
  )
  from deepeval.telemetry import capture_benchmark_run

+
+ logger = logging.getLogger(__name__)
  DELIMITER = ","


@@ -164,7 +170,7 @@ class DROP(DeepEvalBaseBenchmark):
      def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
          # Define prompt template
          assert (
-             self.shots_dataset != None
+             self.shots_dataset is not None
          ), "Example dataset is empty. Call load_benchmark."
          prompt: dict = DROPTemplate.generate_output(
              train_set=self.shots_dataset,
@@ -206,7 +212,7 @@ class DROP(DeepEvalBaseBenchmark):
      ) -> List[Dict]:
          # Define prompt template
          assert (
-             self.shots_dataset != None
+             self.shots_dataset is not None
          ), "Example dataset is empty. Call load_benchmark."

          prompts = []
@@ -215,7 +221,6 @@ class DROP(DeepEvalBaseBenchmark):
          prompt: dict = DROPTemplate.generate_output(
              train_set=self.shots_dataset,
              input=golden.input,
-             type=golden.context[0],
              n_shots=self.n_shots,
          )
          prompts.append(prompt)
@@ -228,23 +233,44 @@ class DROP(DeepEvalBaseBenchmark):
          schema = DROPStringSchema
          schemas.append(schema)

+         effective_batch_size = len(goldens)
+         model_name = getattr(
+             model, "get_model_name", lambda: type(model).__name__
+         )()
+
          try:
              responses: List[
                  Union[DROPNumberSchema, DROPDateSchema, DROPStringSchema]
              ] = model.batch_generate(prompts=prompts, schemas=schemas)
              predictions = [str(res.answer) for res in responses]
-         except TypeError:
-             prompts = [
-                 prompt
-                 + "Output should be of type {type}. No explanation needed.".format(
-                     type=type
-                 )
-                 for prompt in prompts
-             ]
-             predictions = model.batch_generate(prompts)
+         except (AttributeError, NotImplementedError) as e:
+             logger.error(
+                 "DROP: model %s does not implement batch_generate. Batch evaluation "
+                 "(effective batch_size=%s) requires a batch-capable model. "
+                 "Use a model that implements batch_generate(prompts, schemas) or run with batch_size=0/None.",
+                 model_name,
+                 effective_batch_size,
+                 exc_info=get_settings().DEEPEVAL_LOG_STACK_TRACES,
+             )
+             raise DeepEvalError(
+                 "Model does not implement batch_generate. Use a batch-capable model or set batch_size=0/None."
+             ) from e
+
+         except TypeError as e:
+             logger.error(
+                 "DROP: model %s does not support schema-aware batch generation "
+                 "(batch_generate(prompts, schemas)). DROP requires structured outputs "
+                 "for number/date/span. Use a model that supports schemas or run with batch_size=0/None.",
+                 model_name,
+                 exc_info=get_settings().DEEPEVAL_LOG_STACK_TRACES,
+             )
+             raise DeepEvalError(
+                 "Model does not support schema-aware batch generation required by DROP. "
+                 "Use batch_generate(prompts, schemas) or set batch_size=0/None."
+             ) from e

-         if len(predictions) is not len(goldens):
-             raise ValueError(
+         if len(predictions) != effective_batch_size:
+             raise DeepEvalError(
              "Custom `batch_generate` method did not return the same number of generations as the number of prompts."
          )

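One detail worth calling out in the new error paths: the model name used in the log messages is resolved with a getattr fallback, so models that do not implement get_model_name still produce a readable name. A self-contained sketch of that pattern:

    # Prefer model.get_model_name() when it exists; otherwise fall back
    # to the class name, exactly as the diff above does.
    class Unnamed:
        pass

    class Named:
        def get_model_name(self) -> str:
            return "my-llm"

    for model in (Unnamed(), Named()):
        name = getattr(model, "get_model_name", lambda: type(model).__name__)()
        print(name)  # -> "Unnamed", then "my-llm"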
deepeval/benchmarks/ifeval/ifeval.py CHANGED
@@ -1,8 +1,8 @@
- from pydantic.config import ConfigDict
  from deepeval.benchmarks.base_benchmark import (
      DeepEvalBaseBenchmark,
      DeepEvalBaseBenchmarkResult,
  )
+ from deepeval.utils import make_model_config
  from typing import List, Optional, Dict, Any, Tuple
  from tqdm import tqdm
  import re
@@ -19,7 +19,7 @@ from deepeval.telemetry import capture_benchmark_run


  class IFEvalResult(DeepEvalBaseBenchmarkResult):
-     model_config = ConfigDict(arbitrary_types_allowed=True)
+     model_config = make_model_config(arbitrary_types_allowed=True)
      instruction_breakdown: dict[str, Any]
      predictions: "pd.DataFrame"

deepeval/confident/types.py CHANGED
@@ -1,9 +1,11 @@
- from pydantic import BaseModel, ConfigDict
+ from pydantic import BaseModel
  from typing import Any, Optional

+ from deepeval.utils import make_model_config
+

  class ApiResponse(BaseModel):
-     model_config = ConfigDict(extra="ignore")
+     model_config = make_model_config(extra="ignore")

      success: bool
      data: Optional[Any] = None
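Both of the last two files swap pydantic's ConfigDict for deepeval's make_model_config helper. Judging from these call sites it is a drop-in wrapper that forwards keyword arguments to a pydantic model config, presumably so deepeval can apply shared defaults in one place; the exact behavior is an assumption here. A minimal sketch of equivalent usage:

    from pydantic import BaseModel
    from deepeval.utils import make_model_config

    class Example(BaseModel):
        # assumed: behaves like ConfigDict(extra="ignore"), plus any
        # library-wide defaults make_model_config injects
        model_config = make_model_config(extra="ignore")
        name: str

    print(Example(name="ok", unexpected=1).name)  # extra field is ignored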