deepeval-3.6.9-py3-none-any.whl → deepeval-3.7.1-py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.
Files changed (68)
  1. deepeval/__init__.py +0 -4
  2. deepeval/_version.py +1 -1
  3. deepeval/anthropic/__init__.py +19 -0
  4. deepeval/anthropic/extractors.py +94 -0
  5. deepeval/anthropic/patch.py +169 -0
  6. deepeval/anthropic/utils.py +225 -0
  7. deepeval/benchmarks/drop/drop.py +40 -14
  8. deepeval/benchmarks/ifeval/ifeval.py +2 -2
  9. deepeval/cli/main.py +7 -0
  10. deepeval/confident/api.py +6 -1
  11. deepeval/confident/types.py +4 -2
  12. deepeval/config/settings.py +159 -11
  13. deepeval/config/settings_manager.py +4 -0
  14. deepeval/evaluate/compare.py +215 -4
  15. deepeval/evaluate/types.py +6 -0
  16. deepeval/evaluate/utils.py +30 -0
  17. deepeval/integrations/crewai/handler.py +36 -0
  18. deepeval/integrations/langchain/callback.py +27 -2
  19. deepeval/integrations/llama_index/handler.py +58 -4
  20. deepeval/integrations/llama_index/utils.py +24 -0
  21. deepeval/key_handler.py +1 -0
  22. deepeval/metrics/__init__.py +5 -0
  23. deepeval/metrics/arena_g_eval/arena_g_eval.py +5 -1
  24. deepeval/metrics/arena_g_eval/utils.py +5 -5
  25. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +9 -18
  26. deepeval/metrics/exact_match/__init__.py +0 -0
  27. deepeval/metrics/exact_match/exact_match.py +94 -0
  28. deepeval/metrics/g_eval/g_eval.py +5 -1
  29. deepeval/metrics/g_eval/utils.py +1 -1
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +5 -1
  31. deepeval/metrics/pattern_match/__init__.py +0 -0
  32. deepeval/metrics/pattern_match/pattern_match.py +103 -0
  33. deepeval/metrics/task_completion/task_completion.py +9 -2
  34. deepeval/metrics/utils.py +1 -1
  35. deepeval/model_integrations/__init__.py +0 -0
  36. deepeval/model_integrations/utils.py +116 -0
  37. deepeval/models/base_model.py +3 -1
  38. deepeval/models/llms/gemini_model.py +27 -5
  39. deepeval/openai/__init__.py +3 -1
  40. deepeval/openai/extractors.py +2 -2
  41. deepeval/openai/utils.py +7 -31
  42. deepeval/openai_agents/callback_handler.py +12 -3
  43. deepeval/prompt/api.py +11 -10
  44. deepeval/prompt/prompt.py +27 -15
  45. deepeval/simulator/template.py +1 -1
  46. deepeval/telemetry.py +3 -3
  47. deepeval/test_case/__init__.py +2 -1
  48. deepeval/test_case/arena_test_case.py +15 -4
  49. deepeval/test_case/llm_test_case.py +3 -2
  50. deepeval/test_case/mllm_test_case.py +45 -22
  51. deepeval/test_run/api.py +3 -2
  52. deepeval/test_run/cache.py +35 -13
  53. deepeval/test_run/hyperparameters.py +5 -1
  54. deepeval/test_run/test_run.py +52 -14
  55. deepeval/tracing/api.py +11 -10
  56. deepeval/tracing/otel/exporter.py +11 -0
  57. deepeval/tracing/patchers.py +102 -1
  58. deepeval/tracing/trace_context.py +13 -4
  59. deepeval/tracing/tracing.py +11 -2
  60. deepeval/tracing/types.py +8 -8
  61. deepeval/tracing/utils.py +9 -0
  62. deepeval/utils.py +48 -2
  63. {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/METADATA +3 -3
  64. {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/RECORD +68 -58
  65. /deepeval/{openai → model_integrations}/types.py +0 -0
  66. {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/LICENSE.md +0 -0
  67. {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/WHEEL +0 -0
  68. {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/entry_points.txt +0 -0
deepeval/model_integrations/utils.py ADDED
@@ -0,0 +1,116 @@
+ import json
+ import uuid
+ from typing import Any, List, Optional
+
+ from deepeval.model_integrations.types import InputParameters, OutputParameters
+ from deepeval.test_case.llm_test_case import ToolCall
+ from deepeval.tracing.context import (
+     current_span_context,
+     current_trace_context,
+     update_current_span,
+     update_llm_span,
+ )
+ from deepeval.tracing.trace_context import current_llm_context
+ from deepeval.tracing.types import ToolSpan, TraceSpanStatus
+ from deepeval.utils import shorten, len_long
+
+
+ def _update_all_attributes(
+     input_parameters: InputParameters,
+     output_parameters: OutputParameters,
+     expected_tools: List[ToolCall],
+     expected_output: str,
+     context: List[str],
+     retrieval_context: List[str],
+ ):
+     """Update span and trace attributes with input/output parameters."""
+     update_current_span(
+         input=input_parameters.input or input_parameters.messages or "NA",
+         output=output_parameters.output or "NA",
+         tools_called=output_parameters.tools_called,
+         # attributes to be added
+         expected_output=expected_output,
+         expected_tools=expected_tools,
+         context=context,
+         retrieval_context=retrieval_context,
+     )
+
+     llm_context = current_llm_context.get()
+
+     update_llm_span(
+         input_token_count=output_parameters.prompt_tokens,
+         output_token_count=output_parameters.completion_tokens,
+         prompt=llm_context.prompt,
+     )
+
+     if output_parameters.tools_called:
+         create_child_tool_spans(output_parameters)
+
+     __update_input_and_output_of_current_trace(
+         input_parameters, output_parameters
+     )
+
+
+ def __update_input_and_output_of_current_trace(
+     input_parameters: InputParameters, output_parameters: OutputParameters
+ ):
+
+     current_trace = current_trace_context.get()
+     if current_trace:
+         if current_trace.input is None:
+             current_trace.input = (
+                 input_parameters.input or input_parameters.messages
+             )
+         if current_trace.output is None:
+             current_trace.output = output_parameters.output
+
+     return
+
+
+ def create_child_tool_spans(output_parameters: OutputParameters):
+     if output_parameters.tools_called is None:
+         return
+
+     current_span = current_span_context.get()
+     for tool_called in output_parameters.tools_called:
+         tool_span = ToolSpan(
+             **{
+                 "uuid": str(uuid.uuid4()),
+                 "trace_uuid": current_span.trace_uuid,
+                 "parent_uuid": current_span.uuid,
+                 "start_time": current_span.start_time,
+                 "end_time": current_span.start_time,
+                 "status": TraceSpanStatus.SUCCESS,
+                 "children": [],
+                 "name": tool_called.name,
+                 "input": tool_called.input_parameters,
+                 "output": None,
+                 "metrics": None,
+                 "description": tool_called.description,
+             }
+         )
+         current_span.children.append(tool_span)
+
+
+ _URL_MAX = 200
+ _JSON_MAX = max(
+     len_long(), 400
+ )  # <- make this bigger by increasing DEEPEVAL_MAXLEN_LONG above 400
+
+
+ def compact_dump(value: Any) -> str:
+     try:
+         dumped = json.dumps(
+             value, ensure_ascii=False, default=str, separators=(",", ":")
+         )
+     except Exception:
+         dumped = repr(value)
+     return shorten(dumped, max_len=_JSON_MAX)
+
+
+ def fmt_url(url: Optional[str]) -> str:
+     if not url:
+         return ""
+     if url.startswith("data:"):
+         return "[data-uri]"
+     return shorten(url, max_len=_URL_MAX)
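This new module ends with two small formatting helpers that deepeval/openai/utils.py now imports (see that file's diff below). A quick usage sketch, assuming deepeval 3.7.1 is installed; all inputs are illustrative:

from uuid import uuid4
from deepeval.model_integrations.utils import compact_dump, fmt_url

# Dicts become compact JSON (no spaces), truncated via shorten() at _JSON_MAX.
print(compact_dump({"role": "user", "content": "hi"}))
# -> {"role":"user","content":"hi"}

# default=str stringifies anything json.dumps can't encode natively.
print(compact_dump({"id": uuid4()}))

# Data URIs are masked; ordinary URLs pass through, shortened past 200 chars.
print(fmt_url("data:image/png;base64,iVBORw0KGgo="))  # -> [data-uri]
print(fmt_url("https://example.com/cat.png"))         # -> https://example.com/cat.png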
deepeval/models/base_model.py CHANGED
@@ -68,7 +68,9 @@ class DeepEvalBaseLLM(ABC):
          Returns:
              A list of strings.
          """
-         raise AttributeError
+         raise NotImplementedError(
+             "batch_generate is not implemented for this model"
+         )
 
      @abstractmethod
      def get_model_name(self, *args, **kwargs) -> str:
deepeval/models/llms/gemini_model.py CHANGED
@@ -1,7 +1,6 @@
  from pydantic import BaseModel
- from google.genai import types
+ from google.genai import types, Client
  from typing import Optional, Dict
- from google import genai
 
  from deepeval.models.retry_policy import (
      create_retry_decorator,
@@ -9,7 +8,8 @@ from deepeval.models.retry_policy import (
  from deepeval.key_handler import ModelKeyValues, KEY_FILE_HANDLER
  from deepeval.models.base_model import DeepEvalBaseLLM
  from deepeval.constants import ProviderSlug as PS
-
+ from google.oauth2 import service_account
+ import json
 
  default_gemini_model = "gemini-1.5-pro"
 
@@ -52,6 +52,7 @@ class GeminiModel(DeepEvalBaseLLM):
          api_key: Optional[str] = None,
          project: Optional[str] = None,
          location: Optional[str] = None,
+         service_account_key: Optional[Dict[str, str]] = None,
          temperature: float = 0,
          generation_kwargs: Optional[Dict] = None,
          **kwargs,
@@ -75,6 +76,17 @@ class GeminiModel(DeepEvalBaseLLM):
          self.use_vertexai = KEY_FILE_HANDLER.fetch_data(
              ModelKeyValues.GOOGLE_GENAI_USE_VERTEXAI
          )
+         if service_account_key:
+             self.service_account_key = service_account_key
+         else:
+             service_account_key_data = KEY_FILE_HANDLER.fetch_data(
+                 ModelKeyValues.GOOGLE_SERVICE_ACCOUNT_KEY
+             )
+             if service_account_key_data is None:
+                 self.service_account_key = None
+             elif isinstance(service_account_key_data, str):
+                 self.service_account_key = json.loads(service_account_key_data)
+
          if temperature < 0:
              raise ValueError("Temperature must be >= 0.")
          self.temperature = temperature
@@ -117,10 +129,20 @@ class GeminiModel(DeepEvalBaseLLM):
              )
 
              # Create client for Vertex AI
-             self.client = genai.Client(
+             self.client = Client(
                  vertexai=True,
                  project=self.project,
                  location=self.location,
+                 credentials=(
+                     service_account.Credentials.from_service_account_info(
+                         self.service_account_key,
+                         scopes=[
+                             "https://www.googleapis.com/auth/cloud-platform"
+                         ],
+                     )
+                     if self.service_account_key
+                     else None
+                 ),
                  **self.kwargs,
              )
          else:
@@ -130,7 +152,7 @@ class GeminiModel(DeepEvalBaseLLM):
                  "or set it in your DeepEval configuration."
              )
              # Create client for Gemini API
-             self.client = genai.Client(api_key=self.api_key, **self.kwargs)
+             self.client = Client(api_key=self.api_key, **self.kwargs)
 
          # Configure default model generation settings
          self.model_safety_settings = [
deepeval/openai/__init__.py CHANGED
@@ -15,5 +15,7 @@ except ImportError:
 
  if OpenAI or AsyncOpenAI:
      from deepeval.openai.patch import patch_openai_classes
+     from deepeval.telemetry import capture_tracing_integration
 
-     patch_openai_classes()
+     with capture_tracing_integration("openai"):
+         patch_openai_classes()
deepeval/openai/extractors.py CHANGED
@@ -4,13 +4,13 @@ from typing import Any, Union, Dict
  from openai.types.responses import Response
 
  from deepeval.test_case.llm_test_case import ToolCall
+
+ from deepeval.model_integrations.types import InputParameters, OutputParameters
  from deepeval.openai.utils import (
      render_response_input,
      stringify_multimodal_content,
      render_messages,
  )
- from deepeval.openai.types import InputParameters, OutputParameters
- from deepeval.tracing.types import Message
 
 
  # guarding against errors to be compatible with legacy APIs
deepeval/openai/utils.py CHANGED
@@ -1,6 +1,6 @@
  import json
  import uuid
- from typing import Any, Dict, List, Optional, Iterable
+ from typing import Any, Dict, List, Iterable
 
  from openai.types.chat.chat_completion_message_param import (
      ChatCompletionMessageParam,
@@ -8,32 +8,8 @@ from openai.types.chat.chat_completion_message_param import (
 
  from deepeval.tracing.types import ToolSpan, TraceSpanStatus
  from deepeval.tracing.context import current_span_context
- from deepeval.utils import shorten, len_long
- from deepeval.openai.types import OutputParameters
-
-
- _URL_MAX = 200
- _JSON_MAX = max(
-     len_long(), 400
- )  # <- make this bigger by increasing DEEPEVAL_MAXLEN_LONG above 400
-
-
- def _compact_dump(value: Any) -> str:
-     try:
-         dumped = json.dumps(
-             value, ensure_ascii=False, default=str, separators=(",", ":")
-         )
-     except Exception:
-         dumped = repr(value)
-     return shorten(dumped, max_len=_JSON_MAX)
-
-
- def _fmt_url(url: Optional[str]) -> str:
-     if not url:
-         return ""
-     if url.startswith("data:"):
-         return "[data-uri]"
-     return shorten(url, max_len=_URL_MAX)
+ from deepeval.model_integrations.types import OutputParameters
+ from deepeval.model_integrations.utils import compact_dump, fmt_url
 
 
  def create_child_tool_spans(output_parameters: OutputParameters):
@@ -111,7 +87,7 @@ def stringify_multimodal_content(content: Any) -> str:
              url = image_url
          else:
              url = (image_url or {}).get("url") or content.get("url")
-         return f"[image:{_fmt_url(url)}]"
+         return f"[image:{fmt_url(url)}]"
 
      # Responses API variants
      if t == "input_text":
@@ -122,14 +98,14 @@ def stringify_multimodal_content(content: Any) -> str:
              url = image_url
          else:
              url = (image_url or {}).get("url") or content.get("url")
-         return f"[image:{_fmt_url(url)}]"
+         return f"[image:{fmt_url(url)}]"
 
      # readability for other input_* types we don't currently handle
      if t and t.startswith("input_"):
          return f"[{t}]"
 
      # unknown dicts and types returned as shortened JSON
-     return _compact_dump(content)
+     return compact_dump(content)
 
 
  def render_messages(
@@ -228,7 +204,7 @@ def _render_content(content: Dict[str, Any], indent: int = 0) -> str:
          lines.append(f"{prefix}{key}:")
          lines.append(_render_content(value, indent + 1))
      elif isinstance(value, list):
-         lines.append(f"{prefix}{key}: {_compact_dump(value)}")
+         lines.append(f"{prefix}{key}: {compact_dump(value)}")
      else:
          lines.append(f"{prefix}{key}: {value}")
 
deepeval/openai_agents/callback_handler.py CHANGED
@@ -1,13 +1,21 @@
+ from time import perf_counter
+
  from deepeval.tracing.tracing import (
      Observer,
      current_span_context,
      trace_manager,
  )
- from deepeval.openai_agents.extractors import *
+ from deepeval.openai_agents.extractors import (
+     update_span_properties,
+     update_trace_properties_from_span_data,
+ )
  from deepeval.tracing.context import current_trace_context
  from deepeval.tracing.utils import make_json_serializable
- from time import perf_counter
- from deepeval.tracing.types import TraceSpanStatus
+ from deepeval.tracing.types import (
+     BaseSpan,
+     LlmSpan,
+     TraceSpanStatus,
+ )
 
  try:
      from agents.tracing import Span, Trace, TracingProcessor
@@ -18,6 +26,7 @@ try:
          GenerationSpanData,
          GuardrailSpanData,
          HandoffSpanData,
+         MCPListToolsSpanData,
          ResponseSpanData,
          SpanData,
      )
deepeval/prompt/api.py CHANGED
@@ -1,8 +1,10 @@
- from pydantic import BaseModel, Field, AliasChoices
+ from pydantic import BaseModel, Field, AliasChoices, ConfigDict
  from enum import Enum
  from typing import List, Optional
  from pydantic import TypeAdapter
 
+ from deepeval.utils import make_model_config
+
  ###################################
  # Model Settings
  ###################################
@@ -92,6 +94,8 @@ class SchemaDataType(Enum):
 
 
  class OutputSchemaField(BaseModel):
+     model_config = make_model_config(use_enum_values=True)
+
      id: str
      type: SchemaDataType
      name: str
@@ -102,9 +106,6 @@ class OutputSchemaField(BaseModel):
          validation_alias=AliasChoices("parent_id", "parentId"),
      )
 
-     class Config:
-         use_enum_values = True
-
 
  class OutputSchema(BaseModel):
      fields: Optional[List[OutputSchemaField]] = None
@@ -187,6 +188,10 @@ class PromptHttpResponse(BaseModel):
 
 
  class PromptPushRequest(BaseModel):
+     model_config = make_model_config(use_enum_values=True)
+
+     model_config = ConfigDict(use_enum_values=True)
+
      alias: str
      text: Optional[str] = None
      messages: Optional[List[PromptMessage]] = None
@@ -203,11 +208,10 @@ class PromptPushRequest(BaseModel):
          default=None, serialization_alias="outputType"
      )
 
-     class Config:
-         use_enum_values = True
-
 
  class PromptUpdateRequest(BaseModel):
+     model_config = make_model_config(use_enum_values=True)
+
      text: Optional[str] = None
      messages: Optional[List[PromptMessage]] = None
      interpolation_type: PromptInterpolationType = Field(
@@ -223,9 +227,6 @@ class PromptUpdateRequest(BaseModel):
          default=None, serialization_alias="outputType"
      )
 
-     class Config:
-         use_enum_values = True
-
 
  class PromptApi(BaseModel):
      id: str
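This file's recurring edit swaps Pydantic v1-style nested `class Config` blocks for v2 `model_config` assignments. `make_model_config` lives in `deepeval.utils` and its body isn't shown in this diff, but the underlying Pydantic v2 pattern it wraps is standard:

from enum import Enum
from pydantic import BaseModel, ConfigDict


class OutputType(Enum):  # illustrative enum, not deepeval's
    JSON = "json"


class Example(BaseModel):
    # v2 replacement for:  class Config: use_enum_values = True
    model_config = ConfigDict(use_enum_values=True)
    output_type: OutputType


# use_enum_values stores the enum's value on the instance, not the member.
print(Example(output_type=OutputType.JSON).output_type)  # -> "json"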
deepeval/prompt/prompt.py CHANGED
@@ -1,3 +1,8 @@
+ import logging
+ import time
+ import json
+ import os
+
  from enum import Enum
  from typing import Optional, List, Dict, Type, Literal
  from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
@@ -7,9 +12,10 @@ import json
  import os
  from pydantic import BaseModel, ValidationError
  import asyncio
- import portalocker
  import threading
 
+ from deepeval.utils import make_model_config, is_read_only_env
+
  from deepeval.prompt.api import (
      PromptHttpResponse,
      PromptMessage,
@@ -22,9 +28,6 @@ from deepeval.prompt.api import (
      ModelSettings,
      OutputSchema,
      OutputType,
-     ReasoningEffort,
-     Verbosity,
-     ModelProvider,
  )
  from deepeval.prompt.utils import (
      interpolate_text,
@@ -34,6 +37,18 @@ from deepeval.prompt.utils import (
  from deepeval.confident.api import Api, Endpoints, HttpMethods
  from deepeval.constants import HIDDEN_DIR
 
+
+ logger = logging.getLogger(__name__)
+
+ portalocker = None
+ if not is_read_only_env():
+     try:
+         import portalocker
+     except Exception as e:
+         logger.warning("failed to import portalocker: %s", e)
+ else:
+     logger.warning("READ_ONLY filesystem: skipping disk cache for prompts.")
+
  CACHE_FILE_NAME = f"{HIDDEN_DIR}/.deepeval-prompt-cache.json"
  VERSION_CACHE_KEY = "version"
  LABEL_CACHE_KEY = "label"
@@ -77,6 +92,8 @@ class CustomEncoder(json.JSONEncoder):
 
 
  class CachedPrompt(BaseModel):
+     model_config = make_model_config(use_enum_values=True)
+
      alias: str
      version: str
      label: Optional[str] = None
@@ -89,9 +106,6 @@ class CachedPrompt(BaseModel):
      output_type: Optional[OutputType]
      output_schema: Optional[OutputSchema]
 
-     class Config:
-         use_enum_values = True
-
 
  class Prompt:
 
@@ -164,7 +178,7 @@ class Prompt:
              content = f.read()
          try:
              data = json.loads(content)
-         except:
+         except (json.JSONDecodeError, TypeError):
              self.text_template = content
              return content
 
@@ -202,7 +216,6 @@ class Prompt:
              "Unable to interpolate empty prompt template. Please pull a prompt from Confident AI or set template manually to continue."
          )
 
-         print("@@@@@")
          return interpolate_text(interpolation_type, text_template, **kwargs)
 
      elif prompt_type == PromptType.LIST:
@@ -247,7 +260,7 @@ class Prompt:
      version: Optional[str] = None,
      label: Optional[str] = None,
  ) -> Optional[CachedPrompt]:
-     if not os.path.exists(CACHE_FILE_NAME):
+     if portalocker is None or not os.path.exists(CACHE_FILE_NAME):
          return None
 
      try:
@@ -295,13 +308,12 @@ class Prompt:
      output_type: Optional[OutputType] = None,
      output_schema: Optional[OutputSchema] = None,
  ):
-     if not self.alias:
+     if portalocker is None or not self.alias:
          return
 
-     # Ensure directory exists
-     os.makedirs(HIDDEN_DIR, exist_ok=True)
-
      try:
+         # Ensure directory exists
+         os.makedirs(HIDDEN_DIR, exist_ok=True)
          # Use r+ mode if file exists, w mode if it doesn't
          mode = "r+" if os.path.exists(CACHE_FILE_NAME) else "w"
@@ -480,7 +492,7 @@ class Prompt:
                  cached_prompt.output_schema
              )
              return
-         except:
+         except Exception:
              pass
 
          api = Api()
deepeval/simulator/template.py CHANGED
@@ -112,7 +112,7 @@ class ConversationSimulatorTemplate:
      ]
      Example JSON Output:
      {{
-         "is_complete": False,
+         "is_complete": false,
          "reason": "The assistant explained how to forget password but ahas not confirmed that the user successfully set a new password."
      }}
 
deepeval/telemetry.py CHANGED
@@ -3,12 +3,12 @@ import logging
  import os
  import socket
  import sys
- from threading import Event
  import uuid
  import sentry_sdk
  from enum import Enum
  from typing import List, Dict
  import requests
+ from deepeval.config.settings import get_settings
  from deepeval.constants import LOGIN_PROMPT, HIDDEN_DIR, KEY_FILE
  from posthog import Posthog
 
@@ -34,7 +34,7 @@ TELEMETRY_PATH = os.path.join(HIDDEN_DIR, TELEMETRY_DATA_FILE)
 
 
  def telemetry_opt_out():
-     return os.getenv("DEEPEVAL_TELEMETRY_OPT_OUT") == "1"
+     return get_settings().DEEPEVAL_TELEMETRY_OPT_OUT
 
 
  def blocked_by_firewall():
@@ -131,7 +131,7 @@ if not telemetry_opt_out():
 
 
  if (
-     os.getenv("ERROR_REPORTING") == "1"
+     get_settings().ERROR_REPORTING
      and not blocked_by_firewall()
      and not telemetry_opt_out()
  ):
deepeval/test_case/__init__.py CHANGED
@@ -10,7 +10,7 @@ from .conversational_test_case import (
      TurnParams,
  )
  from .mllm_test_case import MLLMTestCase, MLLMTestCaseParams, MLLMImage
- from .arena_test_case import ArenaTestCase
+ from .arena_test_case import ArenaTestCase, Contestant
  from .mcp import (
      MCPServer,
      MCPPromptCall,
@@ -35,4 +35,5 @@ __all__ = [
      "MLLMTestCaseParams",
      "MLLMImage",
      "ArenaTestCase",
+     "Contestant",
  ]
deepeval/test_case/arena_test_case.py CHANGED
@@ -1,20 +1,31 @@
+ from typing import List, Dict, Optional, Union
  from dataclasses import dataclass
- from typing import List, Dict
+ from pydantic import BaseModel
+
  from deepeval.test_case import (
      LLMTestCase,
  )
+ from deepeval.prompt import Prompt
+
+
+ class Contestant(BaseModel):
+     name: str
+     test_case: LLMTestCase
+     hyperparameters: Optional[Dict[str, Union[str, int, float, Prompt]]] = None
+
+     model_config = {"arbitrary_types_allowed": True}
 
 
  @dataclass
  class ArenaTestCase:
-     contestants: Dict[str, LLMTestCase]
+     contestants: List[Contestant]
 
      def __post_init__(self):
-         contestant_names = list(self.contestants.keys())
+         contestant_names = [contestant.name for contestant in self.contestants]
          if len(contestant_names) != len(set(contestant_names)):
              raise ValueError("All contestant names must be unique.")
 
-         cases = list(self.contestants.values())
+         cases = [contestant.test_case for contestant in self.contestants]
          ref_input = cases[0].input
          for case in cases[1:]:
              if case.input != ref_input:
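Arena test cases are now built from a list of `Contestant` models instead of a name-to-test-case dict. A minimal sketch of the new shape; names and outputs below are illustrative:

from deepeval.test_case import ArenaTestCase, Contestant, LLMTestCase

shared_input = "Why is the sky blue?"

case = ArenaTestCase(
    contestants=[
        Contestant(
            name="model-a",
            test_case=LLMTestCase(
                input=shared_input, actual_output="Rayleigh scattering."
            ),
            hyperparameters={"temperature": 0},
        ),
        Contestant(
            name="model-b",
            test_case=LLMTestCase(
                input=shared_input, actual_output="Light scattering in air."
            ),
        ),
    ]
)
# __post_init__ rejects duplicate names and mismatched inputs across contestants.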
deepeval/test_case/llm_test_case.py CHANGED
@@ -1,5 +1,4 @@
  from pydantic import (
-     ConfigDict,
      Field,
      BaseModel,
      model_validator,
@@ -11,6 +10,8 @@ from enum import Enum
  import json
  import uuid
 
+ from deepeval.utils import make_model_config
+
  from deepeval.test_case.mcp import (
      MCPServer,
      MCPPromptCall,
@@ -156,7 +157,7 @@ class ToolCall(BaseModel):
 
 
  class LLMTestCase(BaseModel):
-     model_config = ConfigDict(extra="ignore")
+     model_config = make_model_config(extra="ignore")
 
      input: str
      actual_output: Optional[str] = Field(