deepeval 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. deepeval/__init__.py +42 -10
  2. deepeval/_version.py +1 -1
  3. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  4. deepeval/cli/main.py +42 -0
  5. deepeval/confident/api.py +1 -0
  6. deepeval/config/logging.py +33 -0
  7. deepeval/config/settings.py +176 -16
  8. deepeval/constants.py +8 -1
  9. deepeval/dataset/dataset.py +2 -11
  10. deepeval/dataset/utils.py +1 -1
  11. deepeval/evaluate/evaluate.py +5 -1
  12. deepeval/evaluate/execute.py +118 -60
  13. deepeval/evaluate/utils.py +20 -116
  14. deepeval/integrations/crewai/__init__.py +6 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/wrapper.py +45 -5
  18. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  19. deepeval/metrics/api.py +281 -0
  20. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  21. deepeval/metrics/bias/bias.py +12 -3
  22. deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
  23. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  24. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  25. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  26. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  27. deepeval/metrics/conversational_dag/nodes.py +12 -4
  28. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
  29. deepeval/metrics/dag/dag.py +12 -0
  30. deepeval/metrics/dag/nodes.py +12 -4
  31. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  32. deepeval/metrics/g_eval/g_eval.py +37 -15
  33. deepeval/metrics/hallucination/hallucination.py +12 -1
  34. deepeval/metrics/indicator.py +8 -2
  35. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  36. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  37. deepeval/metrics/mcp/mcp_task_completion.py +13 -0
  38. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
  39. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
  40. deepeval/metrics/misuse/misuse.py +12 -1
  41. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  42. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  43. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  44. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  45. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  46. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
  47. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  48. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  49. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  50. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  51. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  52. deepeval/metrics/non_advice/non_advice.py +12 -0
  53. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  54. deepeval/metrics/prompt_alignment/prompt_alignment.py +53 -24
  55. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  56. deepeval/metrics/role_violation/role_violation.py +12 -0
  57. deepeval/metrics/summarization/summarization.py +12 -1
  58. deepeval/metrics/task_completion/task_completion.py +3 -0
  59. deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
  60. deepeval/metrics/toxicity/toxicity.py +12 -0
  61. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  62. deepeval/models/llms/grok_model.py +1 -1
  63. deepeval/models/llms/openai_model.py +2 -0
  64. deepeval/models/retry_policy.py +202 -11
  65. deepeval/openai/__init__.py +14 -32
  66. deepeval/openai/extractors.py +24 -34
  67. deepeval/openai/patch.py +256 -161
  68. deepeval/openai/types.py +20 -0
  69. deepeval/openai/utils.py +98 -56
  70. deepeval/prompt/__init__.py +19 -1
  71. deepeval/prompt/api.py +160 -0
  72. deepeval/prompt/prompt.py +244 -62
  73. deepeval/prompt/utils.py +144 -2
  74. deepeval/synthesizer/chunking/context_generator.py +209 -152
  75. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  76. deepeval/synthesizer/synthesizer.py +8 -5
  77. deepeval/test_case/api.py +131 -0
  78. deepeval/test_run/__init__.py +1 -0
  79. deepeval/test_run/hyperparameters.py +47 -8
  80. deepeval/test_run/test_run.py +104 -1
  81. deepeval/tracing/api.py +3 -1
  82. deepeval/tracing/message_types/__init__.py +10 -0
  83. deepeval/tracing/message_types/base.py +6 -0
  84. deepeval/tracing/message_types/messages.py +14 -0
  85. deepeval/tracing/message_types/tools.py +18 -0
  86. deepeval/tracing/otel/exporter.py +0 -6
  87. deepeval/tracing/otel/utils.py +58 -8
  88. deepeval/tracing/trace_context.py +73 -4
  89. deepeval/tracing/trace_test_manager.py +19 -0
  90. deepeval/tracing/tracing.py +52 -4
  91. deepeval/tracing/types.py +16 -0
  92. deepeval/tracing/utils.py +8 -0
  93. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
  94. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/RECORD +97 -87
  95. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
  96. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
  97. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
@@ -1,6 +1,7 @@
1
1
  from typing import List, Optional, Union
2
2
  import asyncio
3
3
 
4
+ from deepeval.metrics.api import metric_data_manager
4
5
  from deepeval.test_case import (
5
6
  LLMTestCase,
6
7
  LLMTestCaseParams,
@@ -73,6 +74,7 @@ class SummarizationMetric(BaseMetric):
73
74
  test_case: LLMTestCase,
74
75
  _show_indicator: bool = True,
75
76
  _in_component: bool = False,
77
+ _log_metric_to_confident: bool = True,
76
78
  ) -> float:
77
79
 
78
80
  check_llm_test_case_params(test_case, self._required_params, self)
@@ -88,6 +90,7 @@ class SummarizationMetric(BaseMetric):
88
90
  test_case,
89
91
  _show_indicator=False,
90
92
  _in_component=_in_component,
93
+ _log_metric_to_confident=_log_metric_to_confident,
91
94
  )
92
95
  )
93
96
  else:
@@ -121,7 +124,10 @@ class SummarizationMetric(BaseMetric):
121
124
  f"Score: {self.score}\nReason: {self.reason}",
122
125
  ],
123
126
  )
124
-
127
+ if _log_metric_to_confident:
128
+ metric_data_manager.post_metric_if_enabled(
129
+ self, test_case=test_case
130
+ )
125
131
  return self.score
126
132
 
127
133
  async def a_measure(
@@ -129,6 +135,7 @@ class SummarizationMetric(BaseMetric):
129
135
  test_case: LLMTestCase,
130
136
  _show_indicator: bool = True,
131
137
  _in_component: bool = False,
138
+ _log_metric_to_confident: bool = True,
132
139
  ) -> float:
133
140
 
134
141
  check_llm_test_case_params(test_case, self._required_params, self)
@@ -171,6 +178,10 @@ class SummarizationMetric(BaseMetric):
171
178
  f"Score: {self.score}\nReason: {self.reason}",
172
179
  ],
173
180
  )
181
+ if _log_metric_to_confident:
182
+ metric_data_manager.post_metric_if_enabled(
183
+ self, test_case=test_case
184
+ )
174
185
 
175
186
  return self.score
176
187
 
@@ -50,6 +50,7 @@ class TaskCompletionMetric(BaseMetric):
50
50
  test_case: LLMTestCase,
51
51
  _show_indicator: bool = True,
52
52
  _in_component: bool = False,
53
+ _log_metric_to_confident: bool = True,
53
54
  ) -> float:
54
55
  has_trace: bool = isinstance(test_case._trace_dict, Dict)
55
56
  if not has_trace:
@@ -66,6 +67,7 @@ class TaskCompletionMetric(BaseMetric):
66
67
  test_case,
67
68
  _show_indicator=False,
68
69
  _in_component=_in_component,
70
+ _log_metric_to_confident=_log_metric_to_confident,
69
71
  )
70
72
  )
71
73
  else:
@@ -89,6 +91,7 @@ class TaskCompletionMetric(BaseMetric):
89
91
  test_case: LLMTestCase,
90
92
  _show_indicator: bool = True,
91
93
  _in_component: bool = False,
94
+ _log_metric_to_confident: bool = True,
92
95
  ) -> float:
93
96
  has_trace: bool = isinstance(test_case._trace_dict, Dict)
94
97
  if not has_trace:
@@ -12,6 +12,7 @@ from deepeval.test_case import (
12
12
  ToolCall,
13
13
  )
14
14
  from deepeval.metrics import BaseMetric
15
+ from deepeval.metrics.api import metric_data_manager
15
16
 
16
17
 
17
18
  class ToolCorrectnessMetric(BaseMetric):
@@ -45,6 +46,7 @@ class ToolCorrectnessMetric(BaseMetric):
45
46
  test_case: LLMTestCase,
46
47
  _show_indicator: bool = True,
47
48
  _in_component: bool = False,
49
+ _log_metric_to_confident: bool = True,
48
50
  ) -> float:
49
51
 
50
52
  check_llm_test_case_params(test_case, self._required_params, self)
@@ -83,6 +85,11 @@ class ToolCorrectnessMetric(BaseMetric):
83
85
  ]
84
86
  steps.append(f"Score: {self.score}\nReason: {self.reason}")
85
87
  self.verbose_logs = construct_verbose_logs(self, steps=steps)
88
+
89
+ if _log_metric_to_confident:
90
+ metric_data_manager.post_metric_if_enabled(
91
+ self, test_case=test_case
92
+ )
86
93
  return self.score
87
94
 
88
95
  async def a_measure(
@@ -90,6 +97,7 @@ class ToolCorrectnessMetric(BaseMetric):
90
97
  test_case: LLMTestCase,
91
98
  _show_indicator: bool = True,
92
99
  _in_component: bool = False,
100
+ _log_metric_to_confident: bool = True,
93
101
  ) -> float:
94
102
  return self.measure(
95
103
  test_case,
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
17
17
  )
18
18
  from deepeval.metrics.toxicity.template import ToxicityTemplate
19
19
  from deepeval.metrics.toxicity.schema import *
20
+ from deepeval.metrics.api import metric_data_manager
20
21
 
21
22
 
22
23
  class ToxicityMetric(BaseMetric):
@@ -50,6 +51,7 @@ class ToxicityMetric(BaseMetric):
50
51
  test_case: LLMTestCase,
51
52
  _show_indicator: bool = True,
52
53
  _in_component: bool = False,
54
+ _log_metric_to_confident: bool = True,
53
55
  ) -> float:
54
56
 
55
57
  check_llm_test_case_params(test_case, self._required_params, self)
@@ -65,6 +67,7 @@ class ToxicityMetric(BaseMetric):
65
67
  test_case,
66
68
  _show_indicator=False,
67
69
  _in_component=_in_component,
70
+ _log_metric_to_confident=_log_metric_to_confident,
68
71
  )
69
72
  )
70
73
  else:
@@ -84,6 +87,10 @@ class ToxicityMetric(BaseMetric):
84
87
  f"Score: {self.score}\nReason: {self.reason}",
85
88
  ],
86
89
  )
90
+ if _log_metric_to_confident:
91
+ metric_data_manager.post_metric_if_enabled(
92
+ self, test_case=test_case
93
+ )
87
94
 
88
95
  return self.score
89
96
 
@@ -92,6 +99,7 @@ class ToxicityMetric(BaseMetric):
92
99
  test_case: LLMTestCase,
93
100
  _show_indicator: bool = True,
94
101
  _in_component: bool = False,
102
+ _log_metric_to_confident: bool = True,
95
103
  ) -> float:
96
104
 
97
105
  check_llm_test_case_params(test_case, self._required_params, self)
@@ -122,6 +130,10 @@ class ToxicityMetric(BaseMetric):
122
130
  f"Score: {self.score}\nReason: {self.reason}",
123
131
  ],
124
132
  )
133
+ if _log_metric_to_confident:
134
+ metric_data_manager.post_metric_if_enabled(
135
+ self, test_case=test_case
136
+ )
125
137
 
126
138
  return self.score
127
139
 
@@ -20,6 +20,7 @@ from deepeval.metrics.indicator import metric_progress_indicator
20
20
  from deepeval.test_case import ConversationalTestCase, Turn, TurnParams
21
21
  from deepeval.utils import get_or_create_event_loop, prettify_list
22
22
  from deepeval.metrics.turn_relevancy.schema import *
23
+ from deepeval.metrics.api import metric_data_manager
23
24
 
24
25
 
25
26
  class TurnRelevancyMetric(BaseConversationalMetric):
@@ -49,6 +50,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
49
50
  test_case: ConversationalTestCase,
50
51
  _show_indicator: bool = True,
51
52
  _in_component: bool = False,
53
+ _log_metric_to_confident: bool = True,
52
54
  ):
53
55
  check_conversational_test_case_params(
54
56
  test_case, self._required_test_case_params, self
@@ -65,6 +67,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
65
67
  test_case,
66
68
  _show_indicator=False,
67
69
  _in_component=_in_component,
70
+ _log_metric_to_confident=_log_metric_to_confident,
68
71
  )
69
72
  )
70
73
  else:
@@ -91,6 +94,10 @@ class TurnRelevancyMetric(BaseConversationalMetric):
91
94
  f"Score: {self.score}\nReason: {self.reason}",
92
95
  ],
93
96
  )
97
+ if _log_metric_to_confident:
98
+ metric_data_manager.post_metric_if_enabled(
99
+ self, test_case=test_case
100
+ )
94
101
  return self.score
95
102
 
96
103
  async def a_measure(
@@ -98,6 +105,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
98
105
  test_case: ConversationalTestCase,
99
106
  _show_indicator: bool = True,
100
107
  _in_component: bool = False,
108
+ _log_metric_to_confident: bool = True,
101
109
  ) -> float:
102
110
  check_conversational_test_case_params(
103
111
  test_case, self._required_test_case_params, self
@@ -134,6 +142,10 @@ class TurnRelevancyMetric(BaseConversationalMetric):
134
142
  f"Score: {self.score}\nReason: {self.reason}",
135
143
  ],
136
144
  )
145
+ if _log_metric_to_confident:
146
+ metric_data_manager.post_metric_if_enabled(
147
+ self, test_case=test_case
148
+ )
137
149
  return self.score
138
150
 
139
151
  async def _a_generate_reason(self) -> str:
@@ -56,8 +56,8 @@ model_pricing = {
56
56
  class GrokModel(DeepEvalBaseLLM):
57
57
  def __init__(
58
58
  self,
59
- api_key: Optional[str] = None,
60
59
  model: Optional[str] = None,
60
+ api_key: Optional[str] = None,
61
61
  temperature: float = 0,
62
62
  generation_kwargs: Optional[Dict] = None,
63
63
  **kwargs,
@@ -70,6 +70,8 @@ unsupported_log_probs_gpt_models = [
70
70
  "o1-mini-2024-09-12",
71
71
  "o3-mini",
72
72
  "o3-mini-2025-01-31",
73
+ "o4-mini",
74
+ "o4-mini-2025-04-16",
73
75
  "gpt-4.5-preview-2025-02-27",
74
76
  "gpt-5",
75
77
  "gpt-5-2025-08-07",
@@ -33,9 +33,13 @@ Retry logging (settings; read at call time):
33
33
 
34
34
  from __future__ import annotations
35
35
 
36
+ import asyncio
37
+ import inspect
38
+ import itertools
39
+ import functools
40
+ import threading
36
41
  import logging
37
42
 
38
- from deepeval.utils import read_env_int, read_env_float
39
43
  from dataclasses import dataclass, field
40
44
  from typing import Callable, Iterable, Mapping, Optional, Sequence, Tuple, Union
41
45
  from collections.abc import Mapping as ABCMapping
@@ -58,6 +62,9 @@ from deepeval.config.settings import get_settings
58
62
 
59
63
  logger = logging.getLogger(__name__)
60
64
  Provider = Union[str, PS]
65
+ _MAX_TIMEOUT_THREADS = get_settings().DEEPEVAL_TIMEOUT_THREAD_LIMIT
66
+ _TIMEOUT_SEMA = threading.BoundedSemaphore(_MAX_TIMEOUT_THREADS)
67
+ _WORKER_ID = itertools.count(1)
61
68
 
62
69
  # --------------------------
63
70
  # Policy description
@@ -184,6 +191,12 @@ def extract_error_code(
184
191
  # Predicate factory
185
192
  # --------------------------
186
193
 
194
+ _BUILTIN_TIMEOUT_EXCS = (
195
+ (TimeoutError,)
196
+ if asyncio.TimeoutError is TimeoutError
197
+ else (TimeoutError, asyncio.TimeoutError)
198
+ )
199
+
187
200
 
188
201
  def make_is_transient(
189
202
  policy: ErrorPolicy,
@@ -213,6 +226,9 @@ def make_is_transient(
213
226
  )
214
227
 
215
228
  def _pred(e: Exception) -> bool:
229
+ if isinstance(e, _BUILTIN_TIMEOUT_EXCS):
230
+ return True
231
+
216
232
  if isinstance(e, policy.auth_excs):
217
233
  return False
218
234
 
@@ -245,18 +261,23 @@ def make_is_transient(
245
261
 
246
262
  class StopFromEnv(stop_base):
247
263
  def __call__(self, retry_state):
248
- attempts = read_env_int("DEEPEVAL_RETRY_MAX_ATTEMPTS", 2, min_value=1)
264
+ settings = get_settings()
265
+ attempts = (
266
+ settings.DEEPEVAL_RETRY_MAX_ATTEMPTS
267
+ ) # TODO: add constraints in settings
249
268
  return stop_after_attempt(attempts)(retry_state)
250
269
 
251
270
 
252
271
  class WaitFromEnv(wait_base):
253
272
  def __call__(self, retry_state):
254
- initial = read_env_float(
255
- "DEEPEVAL_RETRY_INITIAL_SECONDS", 1.0, min_value=0.0
256
- )
257
- exp_base = read_env_float("DEEPEVAL_RETRY_EXP_BASE", 2.0, min_value=1.0)
258
- jitter = read_env_float("DEEPEVAL_RETRY_JITTER", 2.0, min_value=0.0)
259
- cap = read_env_float("DEEPEVAL_RETRY_CAP_SECONDS", 5.0, min_value=0.0)
273
+ settings = get_settings()
274
+ initial = settings.DEEPEVAL_RETRY_INITIAL_SECONDS
275
+ exp_base = settings.DEEPEVAL_RETRY_EXP_BASE
276
+ jitter = settings.DEEPEVAL_RETRY_JITTER
277
+ cap = settings.DEEPEVAL_RETRY_CAP_SECONDS
278
+
279
+ if cap == 0: # <- 0 means no backoff sleeps or jitter
280
+ return 0
260
281
  return wait_exponential_jitter(
261
282
  initial=initial, exp_base=exp_base, jitter=jitter, max=cap
262
283
  )(retry_state)
@@ -324,10 +345,11 @@ def dynamic_retry(provider: Provider):
324
345
 
325
346
  def _retry_log_levels():
326
347
  s = get_settings()
348
+ base_level = s.LOG_LEVEL if s.LOG_LEVEL is not None else logging.INFO
327
349
  before_level = s.DEEPEVAL_RETRY_BEFORE_LOG_LEVEL
328
350
  after_level = s.DEEPEVAL_RETRY_AFTER_LOG_LEVEL
329
351
  return (
330
- before_level if before_level is not None else logging.INFO,
352
+ before_level if before_level is not None else base_level,
331
353
  after_level if after_level is not None else logging.ERROR,
332
354
  )
333
355
 
@@ -394,21 +416,190 @@ def make_after_log(slug: str):
394
416
  return _after
395
417
 
396
418
 
419
+ def _make_timeout_error(timeout_seconds: float) -> TimeoutError:
420
+ settings = get_settings()
421
+ if logger.isEnabledFor(logging.DEBUG):
422
+ logger.debug(
423
+ "retry config: per_attempt=%s s, max_attempts=%s, per_task_budget=%s s",
424
+ timeout_seconds,
425
+ settings.DEEPEVAL_RETRY_MAX_ATTEMPTS,
426
+ settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
427
+ )
428
+ msg = (
429
+ f"call timed out after {timeout_seconds:g}s (per attempt). "
430
+ "Increase DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS (0 disables) or reduce work per attempt."
431
+ )
432
+ return TimeoutError(msg)
433
+
434
+
435
+ def _run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
436
+ """
437
+ Run a synchronous callable with a soft timeout enforced by a helper thread,
438
+ with a global cap on concurrent timeout-workers.
439
+
440
+ How it works
441
+ ------------
442
+ - A module-level BoundedSemaphore (size = settings.DEEPEVAL_TIMEOUT_THREAD_LIMIT)
443
+ gates creation of timeout worker threads. If no permit is available, this call
444
+ blocks until a slot frees up. If settings.DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS
445
+ > 0 and acquisition takes longer than that, a warning is logged before continuing
446
+ to wait.
447
+ - Once a permit is acquired, a daemon thread executes `func(*args, **kwargs)`.
448
+ - We wait up to `timeout_seconds` for completion. If the timeout elapses, we raise
449
+ `TimeoutError`. The worker thread is not killed, it continues and releases the semaphore when it eventually finishes.
450
+ - If the worker finishes in time, we return its result or re-raise its exception
451
+ (with original traceback).
452
+
453
+ Cancellation semantics
454
+ ----------------------
455
+ This is a soft timeout: Python threads cannot be forcibly terminated. When timeouts
456
+ are rare this is fine. If timeouts are common, consider moving to:
457
+ - a shared ThreadPoolExecutor (caps threads and amortizes creation), or
458
+ - worker process (supports killing in-flight processes)
459
+
460
+ Concurrency control & logging
461
+ -----------------------------
462
+ - Concurrency is bounded by `DEEPEVAL_TIMEOUT_THREAD_LIMIT`.
463
+ - If acquisition exceeds `DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS`, we log a
464
+ warning and then block until a slot is available.
465
+ - On timeout, if DEBUG is enabled and `DEEPEVAL_VERBOSE_MODE` is True, we log a short
466
+ thread sample to help diagnose pressure.
467
+
468
+ Args:
469
+ func: Synchronous callable to execute.
470
+ timeout_seconds: Float seconds for the soft timeout (0/None disables).
471
+ *args, **kwargs: Passed through to `func`.
472
+
473
+ Returns:
474
+ Whatever `func` returns.
475
+
476
+ Raises:
477
+ TimeoutError: If `timeout_seconds` elapse before completion.
478
+ BaseException: If `func` raises, the same exception is re-raised with its
479
+ original traceback.
480
+ """
481
+ if not timeout_seconds or timeout_seconds <= 0:
482
+ return func(*args, **kwargs)
483
+
484
+ # try to respect the global cap on concurrent timeout workers
485
+ warn_after = float(
486
+ get_settings().DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS or 0.0
487
+ )
488
+ if warn_after > 0:
489
+ acquired = _TIMEOUT_SEMA.acquire(timeout=warn_after)
490
+ if not acquired:
491
+ logger.warning(
492
+ "timeout thread limit reached (%d); waiting for a slot...",
493
+ _MAX_TIMEOUT_THREADS,
494
+ )
495
+ _TIMEOUT_SEMA.acquire()
496
+ else:
497
+ _TIMEOUT_SEMA.acquire()
498
+
499
+ done = threading.Event()
500
+ result = {"value": None, "exc": None}
501
+
502
+ def target():
503
+ try:
504
+ result["value"] = func(*args, **kwargs)
505
+ except BaseException as e:
506
+ result["exc"] = e
507
+ finally:
508
+ done.set()
509
+ _TIMEOUT_SEMA.release()
510
+
511
+ t = threading.Thread(
512
+ target=target,
513
+ daemon=True,
514
+ name=f"deepeval-timeout-worker-{next(_WORKER_ID)}",
515
+ )
516
+
517
+ try:
518
+ t.start()
519
+ except BaseException:
520
+ _TIMEOUT_SEMA.release()
521
+ raise
522
+
523
+ finished = done.wait(timeout_seconds)
524
+ if not finished:
525
+ if (
526
+ logger.isEnabledFor(logging.DEBUG)
527
+ and get_settings().DEEPEVAL_VERBOSE_MODE
528
+ ):
529
+ names = [th.name for th in threading.enumerate()[:10]]
530
+ logger.debug(
531
+ "timeout after %.3fs (active_threads=%d, sample=%s)",
532
+ timeout_seconds,
533
+ threading.active_count(),
534
+ names,
535
+ )
536
+ raise _make_timeout_error(timeout_seconds)
537
+
538
+ # Completed within time: return or raise
539
+ if result["exc"] is not None:
540
+ exc = result["exc"]
541
+ raise exc.with_traceback(getattr(exc, "__traceback__", None))
542
+ return result["value"]
543
+
544
+
397
545
  def create_retry_decorator(provider: Provider):
398
546
  """
399
547
  Build a Tenacity @retry decorator wired to our dynamic retry policy
400
548
  for the given provider slug.
401
549
  """
402
550
  slug = slugify(provider)
403
-
404
- return retry(
551
+ base_retry = retry(
405
552
  wait=dynamic_wait(),
406
553
  stop=dynamic_stop(),
407
554
  retry=dynamic_retry(slug),
408
555
  before_sleep=make_before_sleep_log(slug),
409
556
  after=make_after_log(slug),
557
+ reraise=False,
410
558
  )
411
559
 
560
+ def _decorator(func):
561
+ if inspect.iscoroutinefunction(func):
562
+
563
+ @functools.wraps(func)
564
+ async def attempt(*args, **kwargs):
565
+ timeout_seconds = (
566
+ get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
567
+ )
568
+ coro = func(*args, **kwargs)
569
+ if timeout_seconds > 0:
570
+ try:
571
+ return await asyncio.wait_for(coro, timeout_seconds)
572
+ except asyncio.TimeoutError as e:
573
+ if (
574
+ logger.isEnabledFor(logging.DEBUG)
575
+ and get_settings().DEEPEVAL_VERBOSE_MODE is True
576
+ ):
577
+ logger.debug(
578
+ "async timeout after %.3fs (active_threads=%d, tasks=%d)",
579
+ timeout_seconds,
580
+ threading.active_count(),
581
+ len(asyncio.all_tasks()),
582
+ )
583
+ raise _make_timeout_error(timeout_seconds) from e
584
+ return await coro
585
+
586
+ return base_retry(attempt)
587
+
588
+ @functools.wraps(func)
589
+ def attempt(*args, **kwargs):
590
+ timeout_seconds = (
591
+ get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
592
+ )
593
+ if timeout_seconds > 0:
594
+ return _run_sync_with_timeout(
595
+ func, timeout_seconds, *args, **kwargs
596
+ )
597
+ return func(*args, **kwargs)
598
+
599
+ return base_retry(attempt)
600
+
601
+ return _decorator
602
+
412
603
 
413
604
  def _httpx_net_excs() -> tuple[type, ...]:
414
605
  try:
@@ -1,37 +1,19 @@
1
- from importlib.machinery import SourceFileLoader
2
- import importlib.util
3
- import sys
4
-
5
- from deepeval.openai.patch import patch_openai
1
+ try:
2
+ import openai # noqa: F401
3
+ except ImportError:
4
+ raise ModuleNotFoundError(
5
+ "Please install OpenAI to use this feature: 'pip install openai'"
6
+ )
6
7
 
7
8
 
8
- def load_and_patch_openai():
9
- openai_spec = importlib.util.find_spec("openai")
10
- if not openai_spec or not openai_spec.origin:
11
- raise ImportError("Could not find the OpenAI package")
12
- package_dirs = openai_spec.submodule_search_locations
13
- loader = SourceFileLoader("deepeval_openai", openai_spec.origin)
14
- new_spec = importlib.util.spec_from_loader(
15
- "deepeval_openai",
16
- loader,
17
- origin=openai_spec.origin,
18
- is_package=True,
19
- )
20
- deepeval_openai = importlib.util.module_from_spec(new_spec)
21
- deepeval_openai.__path__ = package_dirs
22
- sys.modules["deepeval_openai"] = deepeval_openai
23
- loader.exec_module(deepeval_openai)
24
- patch_openai(deepeval_openai)
25
- return deepeval_openai
9
+ try:
10
+ from openai import OpenAI, AsyncOpenAI # noqa: F401
11
+ except ImportError:
12
+ OpenAI = None # type: ignore
13
+ AsyncOpenAI = None # type: ignore
26
14
 
27
15
 
28
- patched_openai = load_and_patch_openai()
29
- openai = patched_openai
30
- OpenAI = patched_openai.OpenAI
31
- AsyncOpenAI = patched_openai.AsyncOpenAI
16
+ if OpenAI or AsyncOpenAI:
17
+ from deepeval.openai.patch import patch_openai_classes
32
18
 
33
- __all__ = [
34
- "openai",
35
- "OpenAI",
36
- "AsyncOpenAI",
37
- ]
19
+ patch_openai_classes()