deepeval 3.6.9-py3-none-any.whl → 3.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. deepeval/_version.py +1 -1
  2. deepeval/anthropic/__init__.py +19 -0
  3. deepeval/anthropic/extractors.py +94 -0
  4. deepeval/anthropic/patch.py +169 -0
  5. deepeval/anthropic/utils.py +225 -0
  6. deepeval/benchmarks/drop/drop.py +40 -14
  7. deepeval/benchmarks/ifeval/ifeval.py +2 -2
  8. deepeval/confident/types.py +4 -2
  9. deepeval/config/settings.py +154 -11
  10. deepeval/config/settings_manager.py +4 -0
  11. deepeval/integrations/crewai/handler.py +36 -0
  12. deepeval/integrations/langchain/callback.py +27 -2
  13. deepeval/integrations/llama_index/handler.py +58 -4
  14. deepeval/integrations/llama_index/utils.py +24 -0
  15. deepeval/metrics/__init__.py +5 -0
  16. deepeval/metrics/exact_match/__init__.py +0 -0
  17. deepeval/metrics/exact_match/exact_match.py +94 -0
  18. deepeval/metrics/pattern_match/__init__.py +0 -0
  19. deepeval/metrics/pattern_match/pattern_match.py +103 -0
  20. deepeval/metrics/task_completion/task_completion.py +9 -2
  21. deepeval/model_integrations/__init__.py +0 -0
  22. deepeval/model_integrations/utils.py +116 -0
  23. deepeval/models/base_model.py +3 -1
  24. deepeval/openai/__init__.py +3 -1
  25. deepeval/openai/extractors.py +2 -2
  26. deepeval/openai/utils.py +7 -31
  27. deepeval/prompt/api.py +11 -10
  28. deepeval/prompt/prompt.py +5 -4
  29. deepeval/telemetry.py +3 -3
  30. deepeval/test_case/llm_test_case.py +3 -2
  31. deepeval/test_run/api.py +3 -2
  32. deepeval/test_run/cache.py +4 -3
  33. deepeval/test_run/test_run.py +24 -5
  34. deepeval/tracing/api.py +11 -10
  35. deepeval/tracing/otel/exporter.py +11 -0
  36. deepeval/tracing/patchers.py +102 -1
  37. deepeval/tracing/trace_context.py +13 -4
  38. deepeval/tracing/tracing.py +10 -1
  39. deepeval/tracing/types.py +8 -8
  40. deepeval/tracing/utils.py +9 -0
  41. deepeval/utils.py +44 -2
  42. {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/METADATA +2 -2
  43. {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/RECORD +47 -37
  44. /deepeval/{openai → model_integrations}/types.py +0 -0
  45. {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/LICENSE.md +0 -0
  46. {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/WHEEL +0 -0
  47. {deepeval-3.6.9.dist-info → deepeval-3.7.0.dist-info}/entry_points.txt +0 -0
deepeval/metrics/pattern_match/pattern_match.py ADDED
@@ -0,0 +1,103 @@
+ import re
+ from typing import List
+
+ from deepeval.metrics.indicator import metric_progress_indicator
+ from deepeval.metrics.utils import (
+     check_llm_test_case_params,
+     construct_verbose_logs,
+ )
+ from deepeval.metrics.api import metric_data_manager
+ from deepeval.metrics import BaseMetric
+ from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+
+
+ class PatternMatchMetric(BaseMetric):
+     _required_params: List[LLMTestCaseParams] = [
+         LLMTestCaseParams.INPUT,
+         LLMTestCaseParams.ACTUAL_OUTPUT,
+     ]
+
+     def __init__(
+         self,
+         pattern: str,
+         ignore_case: bool = False,
+         threshold: float = 1.0,
+         verbose_mode: bool = False,
+     ):
+         self.pattern = pattern.strip()
+         self.ignore_case = ignore_case
+         self.verbose_mode = verbose_mode
+         self.threshold = threshold
+
+         flags = re.IGNORECASE if ignore_case else 0
+         try:
+             self._compiled_pattern = re.compile(self.pattern, flags)
+         except re.error as e:
+             raise ValueError(f"Invalid regex pattern: {pattern} — {e}")
+
+     def measure(
+         self,
+         test_case: LLMTestCase,
+         _show_indicator: bool = True,
+         _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
+     ) -> float:
+         check_llm_test_case_params(test_case, self._required_params, self)
+
+         with metric_progress_indicator(
+             self, _show_indicator=_show_indicator, _in_component=_in_component
+         ):
+             actual = test_case.actual_output.strip()
+             full_match = self._compiled_pattern.fullmatch(actual)
+
+             self.score = 1.0 if full_match else 0.0
+             self.reason = (
+                 f"The actual output fully matches the pattern."
+                 if full_match
+                 else f"The actual output does not match the pattern."
+             )
+             self.success = self.score >= self.threshold
+
+             if self.verbose_mode:
+                 self.verbose_logs = construct_verbose_logs(
+                     self,
+                     steps=[
+                         f"Pattern: {self.pattern}",
+                         f"Actual: {actual}",
+                         f"Score: {self.score:.2f}",
+                         f"Reason: {self.reason}",
+                     ],
+                 )
+
+             if _log_metric_to_confident:
+                 metric_data_manager.post_metric_if_enabled(
+                     self, test_case=test_case
+                 )
+
+             return self.score
+
+     async def a_measure(
+         self,
+         test_case: LLMTestCase,
+         _show_indicator: bool = True,
+         _in_component: bool = False,
+     ) -> float:
+         return self.measure(
+             test_case,
+             _show_indicator=_show_indicator,
+             _in_component=_in_component,
+         )
+
+     def is_successful(self) -> bool:
+         if self.error is not None:
+             self.success = False
+         else:
+             try:
+                 self.success = self.score >= self.threshold
+             except:
+                 self.success = False
+         return self.success
+
+     @property
+     def __name__(self):
+         return "Pattern Match"
deepeval/metrics/task_completion/task_completion.py CHANGED
@@ -36,6 +36,11 @@ class TaskCompletionMetric(BaseMetric):
          strict_mode: bool = False,
          verbose_mode: bool = False,
      ):
+         if task is None:
+             self._is_task_provided = False
+         else:
+             self._is_task_provided = True
+
          self.task = task
          self.threshold = 1 if strict_mode else threshold
          self.model, self.using_native_model = initialize_model(model)
@@ -73,7 +78,8 @@ class TaskCompletionMetric(BaseMetric):
              )
          else:
              task, self.outcome = self._extract_task_and_outcome(test_case)
-             self.task = task if self.task is None else self.task
+             if self.task is None or not self._is_task_provided:
+                 self.task = task
          self.verdict, self.reason = self._generate_verdicts()
          self.score = self._calculate_score()
          self.success = self.score >= self.threshold
@@ -108,7 +114,8 @@ class TaskCompletionMetric(BaseMetric):
              task, self.outcome = await self._a_extract_task_and_outcome(
                  test_case
              )
-             self.task = task if self.task is None else self.task
+             if self.task is None or not self._is_task_provided:
+                 self.task = task
          self.verdict, self.reason = await self._a_generate_verdicts()
          self.score = self._calculate_score()
          self.success = self.score >= self.threshold
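
A short sketch of what the new _is_task_provided flag changes when a single TaskCompletionMetric instance is reused across test cases (the test case variables below are hypothetical placeholders; only the constructor and measure() flow shown in the hunks above is relied on):

from deepeval.metrics import TaskCompletionMetric

metric = TaskCompletionMetric()            # task=None -> _is_task_provided=False
metric.measure(first_agentic_test_case)    # task extracted from this test case
metric.measure(second_agentic_test_case)   # task re-extracted, no longer pinned
                                           # to the first extraction

pinned = TaskCompletionMetric(task="Book a table for two")
pinned.measure(first_agentic_test_case)    # user-supplied task is kept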
File without changes
deepeval/model_integrations/utils.py ADDED
@@ -0,0 +1,116 @@
+ import json
+ import uuid
+ from typing import Any, List, Optional
+
+ from deepeval.model_integrations.types import InputParameters, OutputParameters
+ from deepeval.test_case.llm_test_case import ToolCall
+ from deepeval.tracing.context import (
+     current_span_context,
+     current_trace_context,
+     update_current_span,
+     update_llm_span,
+ )
+ from deepeval.tracing.trace_context import current_llm_context
+ from deepeval.tracing.types import ToolSpan, TraceSpanStatus
+ from deepeval.utils import shorten, len_long
+
+
+ def _update_all_attributes(
+     input_parameters: InputParameters,
+     output_parameters: OutputParameters,
+     expected_tools: List[ToolCall],
+     expected_output: str,
+     context: List[str],
+     retrieval_context: List[str],
+ ):
+     """Update span and trace attributes with input/output parameters."""
+     update_current_span(
+         input=input_parameters.input or input_parameters.messages or "NA",
+         output=output_parameters.output or "NA",
+         tools_called=output_parameters.tools_called,
+         # attributes to be added
+         expected_output=expected_output,
+         expected_tools=expected_tools,
+         context=context,
+         retrieval_context=retrieval_context,
+     )
+
+     llm_context = current_llm_context.get()
+
+     update_llm_span(
+         input_token_count=output_parameters.prompt_tokens,
+         output_token_count=output_parameters.completion_tokens,
+         prompt=llm_context.prompt,
+     )
+
+     if output_parameters.tools_called:
+         create_child_tool_spans(output_parameters)
+
+     __update_input_and_output_of_current_trace(
+         input_parameters, output_parameters
+     )
+
+
+ def __update_input_and_output_of_current_trace(
+     input_parameters: InputParameters, output_parameters: OutputParameters
+ ):
+
+     current_trace = current_trace_context.get()
+     if current_trace:
+         if current_trace.input is None:
+             current_trace.input = (
+                 input_parameters.input or input_parameters.messages
+             )
+         if current_trace.output is None:
+             current_trace.output = output_parameters.output
+
+     return
+
+
+ def create_child_tool_spans(output_parameters: OutputParameters):
+     if output_parameters.tools_called is None:
+         return
+
+     current_span = current_span_context.get()
+     for tool_called in output_parameters.tools_called:
+         tool_span = ToolSpan(
+             **{
+                 "uuid": str(uuid.uuid4()),
+                 "trace_uuid": current_span.trace_uuid,
+                 "parent_uuid": current_span.uuid,
+                 "start_time": current_span.start_time,
+                 "end_time": current_span.start_time,
+                 "status": TraceSpanStatus.SUCCESS,
+                 "children": [],
+                 "name": tool_called.name,
+                 "input": tool_called.input_parameters,
+                 "output": None,
+                 "metrics": None,
+                 "description": tool_called.description,
+             }
+         )
+         current_span.children.append(tool_span)
+
+
+ _URL_MAX = 200
+ _JSON_MAX = max(
+     len_long(), 400
+ )  # <- make this bigger by increasing DEEPEVAL_MAXLEN_LONG above 400
+
+
+ def compact_dump(value: Any) -> str:
+     try:
+         dumped = json.dumps(
+             value, ensure_ascii=False, default=str, separators=(",", ":")
+         )
+     except Exception:
+         dumped = repr(value)
+     return shorten(dumped, max_len=_JSON_MAX)
+
+
+ def fmt_url(url: Optional[str]) -> str:
+     if not url:
+         return ""
+     if url.startswith("data:"):
+         return "[data-uri]"
+     return shorten(url, max_len=_URL_MAX)
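
A quick sketch of the two helpers hoisted into model_integrations/utils.py, using only the behavior visible in the file above:

from deepeval.model_integrations.utils import compact_dump, fmt_url

# Compact, separator-only JSON, shortened past the configured maximum
# (len_long(), but at least 400 characters).
compact_dump({"role": "user", "content": ["hi", 1]})
# -> '{"role":"user","content":["hi",1]}'

fmt_url("data:image/png;base64,AAAA")      # -> "[data-uri]"
fmt_url("https://example.com/cat.png")     # -> the URL, truncated past 200 chars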
deepeval/models/base_model.py CHANGED
@@ -68,7 +68,9 @@ class DeepEvalBaseLLM(ABC):
          Returns:
              A list of strings.
          """
-         raise AttributeError
+         raise NotImplementedError(
+             "batch_generate is not implemented for this model"
+         )

      @abstractmethod
      def get_model_name(self, *args, **kwargs) -> str:
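
Caller-side sketch of the exception change above: code that previously probed for batch support by catching AttributeError should now also expect NotImplementedError (custom_model and prompts are hypothetical placeholders for any DeepEvalBaseLLM subclass and its inputs):

try:
    outputs = custom_model.batch_generate(prompts)
except (AttributeError, NotImplementedError):
    # fall back to one generate() call per prompt
    outputs = [custom_model.generate(p) for p in prompts]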
deepeval/openai/__init__.py CHANGED
@@ -15,5 +15,7 @@ except ImportError:

  if OpenAI or AsyncOpenAI:
      from deepeval.openai.patch import patch_openai_classes
+     from deepeval.telemetry import capture_tracing_integration

-     patch_openai_classes()
+     with capture_tracing_integration("openai"):
+         patch_openai_classes()
deepeval/openai/extractors.py CHANGED
@@ -4,13 +4,13 @@ from typing import Any, Union, Dict
  from openai.types.responses import Response

  from deepeval.test_case.llm_test_case import ToolCall
+
+ from deepeval.model_integrations.types import InputParameters, OutputParameters
  from deepeval.openai.utils import (
      render_response_input,
      stringify_multimodal_content,
      render_messages,
  )
- from deepeval.openai.types import InputParameters, OutputParameters
- from deepeval.tracing.types import Message


  # guarding against errors to be compatible with legacy APIs
deepeval/openai/utils.py CHANGED
@@ -1,6 +1,6 @@
  import json
  import uuid
- from typing import Any, Dict, List, Optional, Iterable
+ from typing import Any, Dict, List, Iterable

  from openai.types.chat.chat_completion_message_param import (
      ChatCompletionMessageParam,
@@ -8,32 +8,8 @@ from openai.types.chat.chat_completion_message_param import (

  from deepeval.tracing.types import ToolSpan, TraceSpanStatus
  from deepeval.tracing.context import current_span_context
- from deepeval.utils import shorten, len_long
- from deepeval.openai.types import OutputParameters
-
-
- _URL_MAX = 200
- _JSON_MAX = max(
-     len_long(), 400
- )  # <- make this bigger by increasing DEEPEVAL_MAXLEN_LONG above 400
-
-
- def _compact_dump(value: Any) -> str:
-     try:
-         dumped = json.dumps(
-             value, ensure_ascii=False, default=str, separators=(",", ":")
-         )
-     except Exception:
-         dumped = repr(value)
-     return shorten(dumped, max_len=_JSON_MAX)
-
-
- def _fmt_url(url: Optional[str]) -> str:
-     if not url:
-         return ""
-     if url.startswith("data:"):
-         return "[data-uri]"
-     return shorten(url, max_len=_URL_MAX)
+ from deepeval.model_integrations.types import OutputParameters
+ from deepeval.model_integrations.utils import compact_dump, fmt_url


  def create_child_tool_spans(output_parameters: OutputParameters):
@@ -111,7 +87,7 @@ def stringify_multimodal_content(content: Any) -> str:
              url = image_url
          else:
              url = (image_url or {}).get("url") or content.get("url")
-         return f"[image:{_fmt_url(url)}]"
+         return f"[image:{fmt_url(url)}]"

      # Responses API variants
      if t == "input_text":
@@ -122,14 +98,14 @@ def stringify_multimodal_content(content: Any) -> str:
              url = image_url
          else:
              url = (image_url or {}).get("url") or content.get("url")
-         return f"[image:{_fmt_url(url)}]"
+         return f"[image:{fmt_url(url)}]"

      # readability for other input_* types we don't currently handle
      if t and t.startswith("input_"):
          return f"[{t}]"

      # unknown dicts and types returned as shortened JSON
-     return _compact_dump(content)
+     return compact_dump(content)


  def render_messages(
@@ -228,7 +204,7 @@ def _render_content(content: Dict[str, Any], indent: int = 0) -> str:
              lines.append(f"{prefix}{key}:")
              lines.append(_render_content(value, indent + 1))
          elif isinstance(value, list):
-             lines.append(f"{prefix}{key}: {_compact_dump(value)}")
+             lines.append(f"{prefix}{key}: {compact_dump(value)}")
          else:
              lines.append(f"{prefix}{key}: {value}")
deepeval/prompt/api.py CHANGED
@@ -1,8 +1,10 @@
- from pydantic import BaseModel, Field, AliasChoices
+ from pydantic import BaseModel, Field, AliasChoices, ConfigDict
  from enum import Enum
  from typing import List, Optional
  from pydantic import TypeAdapter

+ from deepeval.utils import make_model_config
+
  ###################################
  # Model Settings
  ###################################
@@ -92,6 +94,8 @@ class SchemaDataType(Enum):


  class OutputSchemaField(BaseModel):
+     model_config = make_model_config(use_enum_values=True)
+
      id: str
      type: SchemaDataType
      name: str
@@ -102,9 +106,6 @@ class OutputSchemaField(BaseModel):
          validation_alias=AliasChoices("parent_id", "parentId"),
      )

-     class Config:
-         use_enum_values = True
-

  class OutputSchema(BaseModel):
      fields: Optional[List[OutputSchemaField]] = None
@@ -187,6 +188,10 @@ class PromptHttpResponse(BaseModel):


  class PromptPushRequest(BaseModel):
+     model_config = make_model_config(use_enum_values=True)
+
+     model_config = ConfigDict(use_enum_values=True)
+
      alias: str
      text: Optional[str] = None
      messages: Optional[List[PromptMessage]] = None
@@ -203,11 +208,10 @@ class PromptPushRequest(BaseModel):
          default=None, serialization_alias="outputType"
      )

-     class Config:
-         use_enum_values = True
-

  class PromptUpdateRequest(BaseModel):
+     model_config = make_model_config(use_enum_values=True)
+
      text: Optional[str] = None
      messages: Optional[List[PromptMessage]] = None
      interpolation_type: PromptInterpolationType = Field(
@@ -223,9 +227,6 @@ class PromptUpdateRequest(BaseModel):
          default=None, serialization_alias="outputType"
      )

-     class Config:
-         use_enum_values = True
-

  class PromptApi(BaseModel):
      id: str
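
The recurring `class Config:` to `model_config = make_model_config(...)` rewrite in this release follows pydantic v2's ConfigDict style. The helper itself lives in deepeval/utils.py (+44 -2, not shown in this diff); a plausible minimal shape, offered only as an assumption:

from pydantic import ConfigDict

def make_model_config(**overrides) -> ConfigDict:
    # hypothetical sketch: apply shared defaults, then per-model overrides
    return ConfigDict(**overrides)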
deepeval/prompt/prompt.py CHANGED
@@ -5,11 +5,13 @@ from rich.console import Console
  import time
  import json
  import os
- from pydantic import BaseModel, ValidationError
+ from pydantic import BaseModel, ValidationError, ConfigDict
  import asyncio
  import portalocker
  import threading

+ from deepeval.utils import make_model_config
+
  from deepeval.prompt.api import (
      PromptHttpResponse,
      PromptMessage,
@@ -77,6 +79,8 @@ class CustomEncoder(json.JSONEncoder):


  class CachedPrompt(BaseModel):
+     model_config = make_model_config(use_enum_values=True)
+
      alias: str
      version: str
      label: Optional[str] = None
@@ -89,9 +93,6 @@ class CachedPrompt(BaseModel):
      output_type: Optional[OutputType]
      output_schema: Optional[OutputSchema]

-     class Config:
-         use_enum_values = True
-

  class Prompt:
deepeval/telemetry.py CHANGED
@@ -3,12 +3,12 @@ import logging
  import os
  import socket
  import sys
- from threading import Event
  import uuid
  import sentry_sdk
  from enum import Enum
  from typing import List, Dict
  import requests
+ from deepeval.config.settings import get_settings
  from deepeval.constants import LOGIN_PROMPT, HIDDEN_DIR, KEY_FILE
  from posthog import Posthog

@@ -34,7 +34,7 @@ TELEMETRY_PATH = os.path.join(HIDDEN_DIR, TELEMETRY_DATA_FILE)


  def telemetry_opt_out():
-     return os.getenv("DEEPEVAL_TELEMETRY_OPT_OUT") == "1"
+     return get_settings().DEEPEVAL_TELEMETRY_OPT_OUT


  def blocked_by_firewall():
@@ -131,7 +131,7 @@ if not telemetry_opt_out():


  if (
-     os.getenv("ERROR_REPORTING") == "1"
+     get_settings().ERROR_REPORTING
      and not blocked_by_firewall()
      and not telemetry_opt_out()
  ):
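
Opt-out sketch for the settings-based telemetry flag above. Assumption: the DEEPEVAL_TELEMETRY_OPT_OUT settings field is still populated from the environment variable of the same name (the old code read os.getenv directly; the new code goes through get_settings()):

import os

os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"] = "1"  # set before importing deepeval

import deepeval  # telemetry_opt_out() should now evaluate truthy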
deepeval/test_case/llm_test_case.py CHANGED
@@ -1,5 +1,4 @@
  from pydantic import (
-     ConfigDict,
      Field,
      BaseModel,
      model_validator,
@@ -11,6 +10,8 @@ from enum import Enum
  import json
  import uuid

+ from deepeval.utils import make_model_config
+
  from deepeval.test_case.mcp import (
      MCPServer,
      MCPPromptCall,
@@ -156,7 +157,7 @@ class ToolCall(BaseModel):


  class LLMTestCase(BaseModel):
-     model_config = ConfigDict(extra="ignore")
+     model_config = make_model_config(extra="ignore")

      input: str
      actual_output: Optional[str] = Field(
deepeval/test_run/api.py CHANGED
@@ -1,8 +1,9 @@
- from pydantic import BaseModel, Field, ConfigDict
+ from pydantic import BaseModel, Field
  from typing import Optional, List, Union, Dict

  from deepeval.test_case import MLLMImage, ToolCall
  from deepeval.tracing.api import TraceApi, MetricData
+ from deepeval.utils import make_model_config


  class LLMApiTestCase(BaseModel):
@@ -49,7 +50,7 @@ class LLMApiTestCase(BaseModel):
      comments: Optional[str] = Field(None)
      trace: Optional[TraceApi] = Field(None)

-     model_config = ConfigDict(arbitrary_types_allowed=True)
+     model_config = make_model_config(arbitrary_types_allowed=True)
      # metric_collection: Optional[str] = Field(None, alias="metricCollection")

      def update_metric_data(self, metric_data: MetricData):
deepeval/test_run/cache.py CHANGED
@@ -6,6 +6,8 @@ from typing import List, Optional, Union, Dict, Union
  from enum import Enum
  from pydantic import BaseModel, Field

+ from deepeval.utils import make_model_config
+
  from deepeval.test_case import LLMTestCaseParams, LLMTestCase, ToolCallParams
  from deepeval.test_run.api import MetricData
  from deepeval.utils import (
@@ -20,6 +22,8 @@ TEMP_CACHE_FILE_NAME = f"{HIDDEN_DIR}/.temp-deepeval-cache.json"


  class MetricConfiguration(BaseModel):
+     model_config = make_model_config(arbitrary_types_allowed=True)
+
      ##### Required fields #####
      threshold: float
      evaluation_model: Optional[str] = None
@@ -36,9 +40,6 @@ class MetricConfiguration(BaseModel):
          Union[List[LLMTestCaseParams], List[ToolCallParams]]
      ] = None

-     class Config:
-         arbitrary_types_allowed = True
-

  class CachedMetricData(BaseModel):
      metric_data: MetricData
deepeval/test_run/test_run.py CHANGED
@@ -463,19 +463,29 @@ class TestRunManager:
                  mode="r",
                  flags=portalocker.LOCK_SH | portalocker.LOCK_NB,
              ) as file:
-                 self.test_run = self.test_run.load(file)
+                 loaded = self.test_run.load(file)
+                 # only overwrite if loading actually worked
+                 self.test_run = loaded
          except (
              FileNotFoundError,
+             json.JSONDecodeError,
              portalocker.exceptions.LockException,
          ) as e:
-             print(f"Error loading test run from disk: {e}", file=sys.stderr)
-             self.test_run = None
+             print(
+                 f"Warning: Could not load test run from disk: {e}",
+                 file=sys.stderr,
+             )

          return self.test_run

      def save_test_run(self, path: str, save_under_key: Optional[str] = None):
          if self.save_to_disk:
              try:
+                 # ensure parent directory exists
+                 parent = os.path.dirname(path)
+                 if parent:
+                     os.makedirs(parent, exist_ok=True)
+
                  with portalocker.Lock(path, mode="w") as file:
                      if save_under_key:
                          try:
@@ -533,10 +543,19 @@ class TestRunManager:
                      self.test_run.save(file)
          except (
              FileNotFoundError,
+             json.JSONDecodeError,
              portalocker.exceptions.LockException,
          ) as e:
-             print(f"Error updating test run to disk: {e}", file=sys.stderr)
-             self.test_run = None
+             print(
+                 f"Warning: Could not update test run on disk: {e}",
+                 file=sys.stderr,
+             )
+             if self.test_run is None:
+                 # guarantee a valid in-memory run so the update can proceed.
+                 # never destroy in-memory state on I/O failure.
+                 self.create_test_run()
+             self.test_run.add_test_case(api_test_case)
+             self.test_run.set_dataset_properties(test_case)
          else:
              if self.test_run is None:
                  self.create_test_run()
deepeval/tracing/api.py CHANGED
@@ -1,8 +1,9 @@
  from enum import Enum
  from typing import Dict, List, Optional, Union, Literal, Any
- from pydantic import BaseModel, ConfigDict, Field
+ from pydantic import BaseModel, Field

  from deepeval.test_case import ToolCall
+ from deepeval.utils import make_model_config


  class SpanApiType(Enum):
@@ -27,7 +28,7 @@ class PromptApi(BaseModel):


  class MetricData(BaseModel):
-     model_config = ConfigDict(extra="ignore")
+     model_config = make_model_config(extra="ignore")

      name: str
      threshold: float
@@ -42,6 +43,10 @@ class MetricData(BaseModel):


  class BaseApiSpan(BaseModel):
+     model_config = make_model_config(
+         use_enum_values=True, validate_assignment=True
+     )
+
      uuid: str
      name: str = None
      status: TraceSpanApiStatus
@@ -96,12 +101,12 @@ class BaseApiSpan(BaseModel):
      metric_collection: Optional[str] = Field(None, alias="metricCollection")
      metrics_data: Optional[List[MetricData]] = Field(None, alias="metricsData")

-     class Config:
-         use_enum_values = True
-         validate_assignment = True
-

  class TraceApi(BaseModel):
+     model_config = make_model_config(
+         use_enum_values=True, validate_assignment=True
+     )
+
      uuid: str
      base_spans: Optional[List[BaseApiSpan]] = Field(None, alias="baseSpans")
      agent_spans: Optional[List[BaseApiSpan]] = Field(None, alias="agentSpans")
@@ -139,7 +144,3 @@ class TraceApi(BaseModel):

      # Don't serialize these
      confident_api_key: Optional[str] = Field(None, exclude=True)
-
-     class Config:
-         use_enum_values = True
-         validate_assignment = True