deepeval 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. deepeval/__init__.py +42 -10
  2. deepeval/_version.py +1 -1
  3. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  4. deepeval/cli/main.py +42 -0
  5. deepeval/confident/api.py +1 -0
  6. deepeval/config/logging.py +33 -0
  7. deepeval/config/settings.py +176 -16
  8. deepeval/constants.py +8 -1
  9. deepeval/dataset/dataset.py +2 -11
  10. deepeval/dataset/utils.py +1 -1
  11. deepeval/evaluate/evaluate.py +5 -1
  12. deepeval/evaluate/execute.py +118 -60
  13. deepeval/evaluate/utils.py +20 -116
  14. deepeval/integrations/crewai/__init__.py +6 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/wrapper.py +45 -5
  18. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  19. deepeval/metrics/api.py +281 -0
  20. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  21. deepeval/metrics/bias/bias.py +12 -3
  22. deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
  23. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  24. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  25. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  26. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  27. deepeval/metrics/conversational_dag/nodes.py +12 -4
  28. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
  29. deepeval/metrics/dag/dag.py +12 -0
  30. deepeval/metrics/dag/nodes.py +12 -4
  31. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  32. deepeval/metrics/g_eval/g_eval.py +37 -15
  33. deepeval/metrics/hallucination/hallucination.py +12 -1
  34. deepeval/metrics/indicator.py +8 -2
  35. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  36. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  37. deepeval/metrics/mcp/mcp_task_completion.py +13 -0
  38. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
  39. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
  40. deepeval/metrics/misuse/misuse.py +12 -1
  41. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  42. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  43. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  44. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  45. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  46. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
  47. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  48. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  49. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  50. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  51. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  52. deepeval/metrics/non_advice/non_advice.py +12 -0
  53. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  54. deepeval/metrics/prompt_alignment/prompt_alignment.py +53 -24
  55. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  56. deepeval/metrics/role_violation/role_violation.py +12 -0
  57. deepeval/metrics/summarization/summarization.py +12 -1
  58. deepeval/metrics/task_completion/task_completion.py +3 -0
  59. deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
  60. deepeval/metrics/toxicity/toxicity.py +12 -0
  61. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  62. deepeval/models/llms/grok_model.py +1 -1
  63. deepeval/models/llms/openai_model.py +2 -0
  64. deepeval/models/retry_policy.py +202 -11
  65. deepeval/openai/__init__.py +14 -32
  66. deepeval/openai/extractors.py +24 -34
  67. deepeval/openai/patch.py +256 -161
  68. deepeval/openai/types.py +20 -0
  69. deepeval/openai/utils.py +98 -56
  70. deepeval/prompt/__init__.py +19 -1
  71. deepeval/prompt/api.py +160 -0
  72. deepeval/prompt/prompt.py +244 -62
  73. deepeval/prompt/utils.py +144 -2
  74. deepeval/synthesizer/chunking/context_generator.py +209 -152
  75. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  76. deepeval/synthesizer/synthesizer.py +8 -5
  77. deepeval/test_case/api.py +131 -0
  78. deepeval/test_run/__init__.py +1 -0
  79. deepeval/test_run/hyperparameters.py +47 -8
  80. deepeval/test_run/test_run.py +104 -1
  81. deepeval/tracing/api.py +3 -1
  82. deepeval/tracing/message_types/__init__.py +10 -0
  83. deepeval/tracing/message_types/base.py +6 -0
  84. deepeval/tracing/message_types/messages.py +14 -0
  85. deepeval/tracing/message_types/tools.py +18 -0
  86. deepeval/tracing/otel/exporter.py +0 -6
  87. deepeval/tracing/otel/utils.py +58 -8
  88. deepeval/tracing/trace_context.py +73 -4
  89. deepeval/tracing/trace_test_manager.py +19 -0
  90. deepeval/tracing/tracing.py +52 -4
  91. deepeval/tracing/types.py +16 -0
  92. deepeval/tracing/utils.py +8 -0
  93. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
  94. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/RECORD +97 -87
  95. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
  96. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
  97. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
@@ -1,6 +1,7 @@
1
1
  from typing import List, Optional, Union
2
2
  import asyncio
3
3
 
4
+ from deepeval.metrics.api import metric_data_manager
4
5
  from deepeval.test_case import (
5
6
  LLMTestCase,
6
7
  LLMTestCaseParams,
@@ -73,6 +74,7 @@ class SummarizationMetric(BaseMetric):
73
74
  test_case: LLMTestCase,
74
75
  _show_indicator: bool = True,
75
76
  _in_component: bool = False,
77
+ _log_metric_to_confident: bool = True,
76
78
  ) -> float:
77
79
 
78
80
  check_llm_test_case_params(test_case, self._required_params, self)
@@ -88,6 +90,7 @@ class SummarizationMetric(BaseMetric):
88
90
  test_case,
89
91
  _show_indicator=False,
90
92
  _in_component=_in_component,
93
+ _log_metric_to_confident=_log_metric_to_confident,
91
94
  )
92
95
  )
93
96
  else:
@@ -121,7 +124,10 @@ class SummarizationMetric(BaseMetric):
121
124
  f"Score: {self.score}\nReason: {self.reason}",
122
125
  ],
123
126
  )
124
-
127
+ if _log_metric_to_confident:
128
+ metric_data_manager.post_metric_if_enabled(
129
+ self, test_case=test_case
130
+ )
125
131
  return self.score
126
132
 
127
133
  async def a_measure(
@@ -129,6 +135,7 @@ class SummarizationMetric(BaseMetric):
129
135
  test_case: LLMTestCase,
130
136
  _show_indicator: bool = True,
131
137
  _in_component: bool = False,
138
+ _log_metric_to_confident: bool = True,
132
139
  ) -> float:
133
140
 
134
141
  check_llm_test_case_params(test_case, self._required_params, self)
@@ -171,6 +178,10 @@ class SummarizationMetric(BaseMetric):
171
178
  f"Score: {self.score}\nReason: {self.reason}",
172
179
  ],
173
180
  )
181
+ if _log_metric_to_confident:
182
+ metric_data_manager.post_metric_if_enabled(
183
+ self, test_case=test_case
184
+ )
174
185
 
175
186
  return self.score
176
187
 
@@ -50,6 +50,7 @@ class TaskCompletionMetric(BaseMetric):
50
50
  test_case: LLMTestCase,
51
51
  _show_indicator: bool = True,
52
52
  _in_component: bool = False,
53
+ _log_metric_to_confident: bool = True,
53
54
  ) -> float:
54
55
  has_trace: bool = isinstance(test_case._trace_dict, Dict)
55
56
  if not has_trace:
@@ -66,6 +67,7 @@ class TaskCompletionMetric(BaseMetric):
66
67
  test_case,
67
68
  _show_indicator=False,
68
69
  _in_component=_in_component,
70
+ _log_metric_to_confident=_log_metric_to_confident,
69
71
  )
70
72
  )
71
73
  else:
@@ -89,6 +91,7 @@ class TaskCompletionMetric(BaseMetric):
89
91
  test_case: LLMTestCase,
90
92
  _show_indicator: bool = True,
91
93
  _in_component: bool = False,
94
+ _log_metric_to_confident: bool = True,
92
95
  ) -> float:
93
96
  has_trace: bool = isinstance(test_case._trace_dict, Dict)
94
97
  if not has_trace:
@@ -12,6 +12,7 @@ from deepeval.test_case import (
12
12
  ToolCall,
13
13
  )
14
14
  from deepeval.metrics import BaseMetric
15
+ from deepeval.metrics.api import metric_data_manager
15
16
 
16
17
 
17
18
  class ToolCorrectnessMetric(BaseMetric):
@@ -45,6 +46,7 @@ class ToolCorrectnessMetric(BaseMetric):
45
46
  test_case: LLMTestCase,
46
47
  _show_indicator: bool = True,
47
48
  _in_component: bool = False,
49
+ _log_metric_to_confident: bool = True,
48
50
  ) -> float:
49
51
 
50
52
  check_llm_test_case_params(test_case, self._required_params, self)
@@ -83,6 +85,11 @@ class ToolCorrectnessMetric(BaseMetric):
83
85
  ]
84
86
  steps.append(f"Score: {self.score}\nReason: {self.reason}")
85
87
  self.verbose_logs = construct_verbose_logs(self, steps=steps)
88
+
89
+ if _log_metric_to_confident:
90
+ metric_data_manager.post_metric_if_enabled(
91
+ self, test_case=test_case
92
+ )
86
93
  return self.score
87
94
 
88
95
  async def a_measure(
@@ -90,6 +97,7 @@ class ToolCorrectnessMetric(BaseMetric):
90
97
  test_case: LLMTestCase,
91
98
  _show_indicator: bool = True,
92
99
  _in_component: bool = False,
100
+ _log_metric_to_confident: bool = True,
93
101
  ) -> float:
94
102
  return self.measure(
95
103
  test_case,
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
17
17
  )
18
18
  from deepeval.metrics.toxicity.template import ToxicityTemplate
19
19
  from deepeval.metrics.toxicity.schema import *
20
+ from deepeval.metrics.api import metric_data_manager
20
21
 
21
22
 
22
23
  class ToxicityMetric(BaseMetric):
@@ -50,6 +51,7 @@ class ToxicityMetric(BaseMetric):
50
51
  test_case: LLMTestCase,
51
52
  _show_indicator: bool = True,
52
53
  _in_component: bool = False,
54
+ _log_metric_to_confident: bool = True,
53
55
  ) -> float:
54
56
 
55
57
  check_llm_test_case_params(test_case, self._required_params, self)
@@ -65,6 +67,7 @@ class ToxicityMetric(BaseMetric):
65
67
  test_case,
66
68
  _show_indicator=False,
67
69
  _in_component=_in_component,
70
+ _log_metric_to_confident=_log_metric_to_confident,
68
71
  )
69
72
  )
70
73
  else:
@@ -84,6 +87,10 @@ class ToxicityMetric(BaseMetric):
84
87
  f"Score: {self.score}\nReason: {self.reason}",
85
88
  ],
86
89
  )
90
+ if _log_metric_to_confident:
91
+ metric_data_manager.post_metric_if_enabled(
92
+ self, test_case=test_case
93
+ )
87
94
 
88
95
  return self.score
89
96
 
@@ -92,6 +99,7 @@ class ToxicityMetric(BaseMetric):
92
99
  test_case: LLMTestCase,
93
100
  _show_indicator: bool = True,
94
101
  _in_component: bool = False,
102
+ _log_metric_to_confident: bool = True,
95
103
  ) -> float:
96
104
 
97
105
  check_llm_test_case_params(test_case, self._required_params, self)
@@ -122,6 +130,10 @@ class ToxicityMetric(BaseMetric):
122
130
  f"Score: {self.score}\nReason: {self.reason}",
123
131
  ],
124
132
  )
133
+ if _log_metric_to_confident:
134
+ metric_data_manager.post_metric_if_enabled(
135
+ self, test_case=test_case
136
+ )
125
137
 
126
138
  return self.score
127
139
 
@@ -20,6 +20,7 @@ from deepeval.metrics.indicator import metric_progress_indicator
20
20
  from deepeval.test_case import ConversationalTestCase, Turn, TurnParams
21
21
  from deepeval.utils import get_or_create_event_loop, prettify_list
22
22
  from deepeval.metrics.turn_relevancy.schema import *
23
+ from deepeval.metrics.api import metric_data_manager
23
24
 
24
25
 
25
26
  class TurnRelevancyMetric(BaseConversationalMetric):
@@ -49,6 +50,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
49
50
  test_case: ConversationalTestCase,
50
51
  _show_indicator: bool = True,
51
52
  _in_component: bool = False,
53
+ _log_metric_to_confident: bool = True,
52
54
  ):
53
55
  check_conversational_test_case_params(
54
56
  test_case, self._required_test_case_params, self
@@ -65,6 +67,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
65
67
  test_case,
66
68
  _show_indicator=False,
67
69
  _in_component=_in_component,
70
+ _log_metric_to_confident=_log_metric_to_confident,
68
71
  )
69
72
  )
70
73
  else:
@@ -91,6 +94,10 @@ class TurnRelevancyMetric(BaseConversationalMetric):
91
94
  f"Score: {self.score}\nReason: {self.reason}",
92
95
  ],
93
96
  )
97
+ if _log_metric_to_confident:
98
+ metric_data_manager.post_metric_if_enabled(
99
+ self, test_case=test_case
100
+ )
94
101
  return self.score
95
102
 
96
103
  async def a_measure(
@@ -98,6 +105,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
98
105
  test_case: ConversationalTestCase,
99
106
  _show_indicator: bool = True,
100
107
  _in_component: bool = False,
108
+ _log_metric_to_confident: bool = True,
101
109
  ) -> float:
102
110
  check_conversational_test_case_params(
103
111
  test_case, self._required_test_case_params, self
@@ -134,6 +142,10 @@ class TurnRelevancyMetric(BaseConversationalMetric):
134
142
  f"Score: {self.score}\nReason: {self.reason}",
135
143
  ],
136
144
  )
145
+ if _log_metric_to_confident:
146
+ metric_data_manager.post_metric_if_enabled(
147
+ self, test_case=test_case
148
+ )
137
149
  return self.score
138
150
 
139
151
  async def _a_generate_reason(self) -> str:
@@ -56,8 +56,8 @@ model_pricing = {
56
56
  class GrokModel(DeepEvalBaseLLM):
57
57
  def __init__(
58
58
  self,
59
- api_key: Optional[str] = None,
60
59
  model: Optional[str] = None,
60
+ api_key: Optional[str] = None,
61
61
  temperature: float = 0,
62
62
  generation_kwargs: Optional[Dict] = None,
63
63
  **kwargs,
@@ -70,6 +70,8 @@ unsupported_log_probs_gpt_models = [
70
70
  "o1-mini-2024-09-12",
71
71
  "o3-mini",
72
72
  "o3-mini-2025-01-31",
73
+ "o4-mini",
74
+ "o4-mini-2025-04-16",
73
75
  "gpt-4.5-preview-2025-02-27",
74
76
  "gpt-5",
75
77
  "gpt-5-2025-08-07",
@@ -33,9 +33,13 @@ Retry logging (settings; read at call time):
33
33
 
34
34
  from __future__ import annotations
35
35
 
36
+ import asyncio
37
+ import inspect
38
+ import itertools
39
+ import functools
40
+ import threading
36
41
  import logging
37
42
 
38
- from deepeval.utils import read_env_int, read_env_float
39
43
  from dataclasses import dataclass, field
40
44
  from typing import Callable, Iterable, Mapping, Optional, Sequence, Tuple, Union
41
45
  from collections.abc import Mapping as ABCMapping
@@ -58,6 +62,9 @@ from deepeval.config.settings import get_settings
58
62
 
59
63
  logger = logging.getLogger(__name__)
60
64
  Provider = Union[str, PS]
65
+ _MAX_TIMEOUT_THREADS = get_settings().DEEPEVAL_TIMEOUT_THREAD_LIMIT
66
+ _TIMEOUT_SEMA = threading.BoundedSemaphore(_MAX_TIMEOUT_THREADS)
67
+ _WORKER_ID = itertools.count(1)
61
68
 
62
69
  # --------------------------
63
70
  # Policy description
@@ -184,6 +191,12 @@ def extract_error_code(
184
191
  # Predicate factory
185
192
  # --------------------------
186
193
 
194
+ _BUILTIN_TIMEOUT_EXCS = (
195
+ (TimeoutError,)
196
+ if asyncio.TimeoutError is TimeoutError
197
+ else (TimeoutError, asyncio.TimeoutError)
198
+ )
199
+
187
200
 
188
201
  def make_is_transient(
189
202
  policy: ErrorPolicy,
@@ -213,6 +226,9 @@ def make_is_transient(
213
226
  )
214
227
 
215
228
  def _pred(e: Exception) -> bool:
229
+ if isinstance(e, _BUILTIN_TIMEOUT_EXCS):
230
+ return True
231
+
216
232
  if isinstance(e, policy.auth_excs):
217
233
  return False
218
234
 
@@ -245,18 +261,23 @@ def make_is_transient(
245
261
 
246
262
  class StopFromEnv(stop_base):
247
263
  def __call__(self, retry_state):
248
- attempts = read_env_int("DEEPEVAL_RETRY_MAX_ATTEMPTS", 2, min_value=1)
264
+ settings = get_settings()
265
+ attempts = (
266
+ settings.DEEPEVAL_RETRY_MAX_ATTEMPTS
267
+ ) # TODO: add constraints in settings
249
268
  return stop_after_attempt(attempts)(retry_state)
250
269
 
251
270
 
252
271
  class WaitFromEnv(wait_base):
253
272
  def __call__(self, retry_state):
254
- initial = read_env_float(
255
- "DEEPEVAL_RETRY_INITIAL_SECONDS", 1.0, min_value=0.0
256
- )
257
- exp_base = read_env_float("DEEPEVAL_RETRY_EXP_BASE", 2.0, min_value=1.0)
258
- jitter = read_env_float("DEEPEVAL_RETRY_JITTER", 2.0, min_value=0.0)
259
- cap = read_env_float("DEEPEVAL_RETRY_CAP_SECONDS", 5.0, min_value=0.0)
273
+ settings = get_settings()
274
+ initial = settings.DEEPEVAL_RETRY_INITIAL_SECONDS
275
+ exp_base = settings.DEEPEVAL_RETRY_EXP_BASE
276
+ jitter = settings.DEEPEVAL_RETRY_JITTER
277
+ cap = settings.DEEPEVAL_RETRY_CAP_SECONDS
278
+
279
+ if cap == 0: # <- 0 means no backoff sleeps or jitter
280
+ return 0
260
281
  return wait_exponential_jitter(
261
282
  initial=initial, exp_base=exp_base, jitter=jitter, max=cap
262
283
  )(retry_state)
@@ -324,10 +345,11 @@ def dynamic_retry(provider: Provider):
324
345
 
325
346
  def _retry_log_levels():
326
347
  s = get_settings()
348
+ base_level = s.LOG_LEVEL if s.LOG_LEVEL is not None else logging.INFO
327
349
  before_level = s.DEEPEVAL_RETRY_BEFORE_LOG_LEVEL
328
350
  after_level = s.DEEPEVAL_RETRY_AFTER_LOG_LEVEL
329
351
  return (
330
- before_level if before_level is not None else logging.INFO,
352
+ before_level if before_level is not None else base_level,
331
353
  after_level if after_level is not None else logging.ERROR,
332
354
  )
333
355
 
@@ -394,21 +416,190 @@ def make_after_log(slug: str):
394
416
  return _after
395
417
 
396
418
 
419
+ def _make_timeout_error(timeout_seconds: float) -> TimeoutError:
420
+ settings = get_settings()
421
+ if logger.isEnabledFor(logging.DEBUG):
422
+ logger.debug(
423
+ "retry config: per_attempt=%s s, max_attempts=%s, per_task_budget=%s s",
424
+ timeout_seconds,
425
+ settings.DEEPEVAL_RETRY_MAX_ATTEMPTS,
426
+ settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
427
+ )
428
+ msg = (
429
+ f"call timed out after {timeout_seconds:g}s (per attempt). "
430
+ "Increase DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS (0 disables) or reduce work per attempt."
431
+ )
432
+ return TimeoutError(msg)
433
+
434
+
435
+ def _run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
436
+ """
437
+ Run a synchronous callable with a soft timeout enforced by a helper thread,
438
+ with a global cap on concurrent timeout-workers.
439
+
440
+ How it works
441
+ ------------
442
+ - A module-level BoundedSemaphore (size = settings.DEEPEVAL_TIMEOUT_THREAD_LIMIT)
443
+ gates creation of timeout worker threads. If no permit is available, this call
444
+ blocks until a slot frees up. If settings.DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS
445
+ > 0 and acquisition takes longer than that, a warning is logged before continuing
446
+ to wait.
447
+ - Once a permit is acquired, a daemon thread executes `func(*args, **kwargs)`.
448
+ - We wait up to `timeout_seconds` for completion. If the timeout elapses, we raise
449
+ `TimeoutError`. The worker thread is not killed, it continues and releases the semaphore when it eventually finishes.
450
+ - If the worker finishes in time, we return its result or re-raise its exception
451
+ (with original traceback).
452
+
453
+ Cancellation semantics
454
+ ----------------------
455
+ This is a soft timeout: Python threads cannot be forcibly terminated. When timeouts
456
+ are rare this is fine. If timeouts are common, consider moving to:
457
+ - a shared ThreadPoolExecutor (caps threads and amortizes creation), or
458
+ - worker process (supports killing in-flight processes)
459
+
460
+ Concurrency control & logging
461
+ -----------------------------
462
+ - Concurrency is bounded by `DEEPEVAL_TIMEOUT_THREAD_LIMIT`.
463
+ - If acquisition exceeds `DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS`, we log a
464
+ warning and then block until a slot is available.
465
+ - On timeout, if DEBUG is enabled and `DEEPEVAL_VERBOSE_MODE` is True, we log a short
466
+ thread sample to help diagnose pressure.
467
+
468
+ Args:
469
+ func: Synchronous callable to execute.
470
+ timeout_seconds: Float seconds for the soft timeout (0/None disables).
471
+ *args, **kwargs: Passed through to `func`.
472
+
473
+ Returns:
474
+ Whatever `func` returns.
475
+
476
+ Raises:
477
+ TimeoutError: If `timeout_seconds` elapse before completion.
478
+ BaseException: If `func` raises, the same exception is re-raised with its
479
+ original traceback.
480
+ """
481
+ if not timeout_seconds or timeout_seconds <= 0:
482
+ return func(*args, **kwargs)
483
+
484
+ # try to respect the global cap on concurrent timeout workers
485
+ warn_after = float(
486
+ get_settings().DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS or 0.0
487
+ )
488
+ if warn_after > 0:
489
+ acquired = _TIMEOUT_SEMA.acquire(timeout=warn_after)
490
+ if not acquired:
491
+ logger.warning(
492
+ "timeout thread limit reached (%d); waiting for a slot...",
493
+ _MAX_TIMEOUT_THREADS,
494
+ )
495
+ _TIMEOUT_SEMA.acquire()
496
+ else:
497
+ _TIMEOUT_SEMA.acquire()
498
+
499
+ done = threading.Event()
500
+ result = {"value": None, "exc": None}
501
+
502
+ def target():
503
+ try:
504
+ result["value"] = func(*args, **kwargs)
505
+ except BaseException as e:
506
+ result["exc"] = e
507
+ finally:
508
+ done.set()
509
+ _TIMEOUT_SEMA.release()
510
+
511
+ t = threading.Thread(
512
+ target=target,
513
+ daemon=True,
514
+ name=f"deepeval-timeout-worker-{next(_WORKER_ID)}",
515
+ )
516
+
517
+ try:
518
+ t.start()
519
+ except BaseException:
520
+ _TIMEOUT_SEMA.release()
521
+ raise
522
+
523
+ finished = done.wait(timeout_seconds)
524
+ if not finished:
525
+ if (
526
+ logger.isEnabledFor(logging.DEBUG)
527
+ and get_settings().DEEPEVAL_VERBOSE_MODE
528
+ ):
529
+ names = [th.name for th in threading.enumerate()[:10]]
530
+ logger.debug(
531
+ "timeout after %.3fs (active_threads=%d, sample=%s)",
532
+ timeout_seconds,
533
+ threading.active_count(),
534
+ names,
535
+ )
536
+ raise _make_timeout_error(timeout_seconds)
537
+
538
+ # Completed within time: return or raise
539
+ if result["exc"] is not None:
540
+ exc = result["exc"]
541
+ raise exc.with_traceback(getattr(exc, "__traceback__", None))
542
+ return result["value"]
543
+
544
+
397
545
  def create_retry_decorator(provider: Provider):
398
546
  """
399
547
  Build a Tenacity @retry decorator wired to our dynamic retry policy
400
548
  for the given provider slug.
401
549
  """
402
550
  slug = slugify(provider)
403
-
404
- return retry(
551
+ base_retry = retry(
405
552
  wait=dynamic_wait(),
406
553
  stop=dynamic_stop(),
407
554
  retry=dynamic_retry(slug),
408
555
  before_sleep=make_before_sleep_log(slug),
409
556
  after=make_after_log(slug),
557
+ reraise=False,
410
558
  )
411
559
 
560
+ def _decorator(func):
561
+ if inspect.iscoroutinefunction(func):
562
+
563
+ @functools.wraps(func)
564
+ async def attempt(*args, **kwargs):
565
+ timeout_seconds = (
566
+ get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
567
+ )
568
+ coro = func(*args, **kwargs)
569
+ if timeout_seconds > 0:
570
+ try:
571
+ return await asyncio.wait_for(coro, timeout_seconds)
572
+ except asyncio.TimeoutError as e:
573
+ if (
574
+ logger.isEnabledFor(logging.DEBUG)
575
+ and get_settings().DEEPEVAL_VERBOSE_MODE is True
576
+ ):
577
+ logger.debug(
578
+ "async timeout after %.3fs (active_threads=%d, tasks=%d)",
579
+ timeout_seconds,
580
+ threading.active_count(),
581
+ len(asyncio.all_tasks()),
582
+ )
583
+ raise _make_timeout_error(timeout_seconds) from e
584
+ return await coro
585
+
586
+ return base_retry(attempt)
587
+
588
+ @functools.wraps(func)
589
+ def attempt(*args, **kwargs):
590
+ timeout_seconds = (
591
+ get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
592
+ )
593
+ if timeout_seconds > 0:
594
+ return _run_sync_with_timeout(
595
+ func, timeout_seconds, *args, **kwargs
596
+ )
597
+ return func(*args, **kwargs)
598
+
599
+ return base_retry(attempt)
600
+
601
+ return _decorator
602
+
412
603
 
413
604
  def _httpx_net_excs() -> tuple[type, ...]:
414
605
  try:
@@ -1,37 +1,19 @@
1
- from importlib.machinery import SourceFileLoader
2
- import importlib.util
3
- import sys
4
-
5
- from deepeval.openai.patch import patch_openai
1
+ try:
2
+ import openai # noqa: F401
3
+ except ImportError:
4
+ raise ModuleNotFoundError(
5
+ "Please install OpenAI to use this feature: 'pip install openai'"
6
+ )
6
7
 
7
8
 
8
- def load_and_patch_openai():
9
- openai_spec = importlib.util.find_spec("openai")
10
- if not openai_spec or not openai_spec.origin:
11
- raise ImportError("Could not find the OpenAI package")
12
- package_dirs = openai_spec.submodule_search_locations
13
- loader = SourceFileLoader("deepeval_openai", openai_spec.origin)
14
- new_spec = importlib.util.spec_from_loader(
15
- "deepeval_openai",
16
- loader,
17
- origin=openai_spec.origin,
18
- is_package=True,
19
- )
20
- deepeval_openai = importlib.util.module_from_spec(new_spec)
21
- deepeval_openai.__path__ = package_dirs
22
- sys.modules["deepeval_openai"] = deepeval_openai
23
- loader.exec_module(deepeval_openai)
24
- patch_openai(deepeval_openai)
25
- return deepeval_openai
9
+ try:
10
+ from openai import OpenAI, AsyncOpenAI # noqa: F401
11
+ except ImportError:
12
+ OpenAI = None # type: ignore
13
+ AsyncOpenAI = None # type: ignore
26
14
 
27
15
 
28
- patched_openai = load_and_patch_openai()
29
- openai = patched_openai
30
- OpenAI = patched_openai.OpenAI
31
- AsyncOpenAI = patched_openai.AsyncOpenAI
16
+ if OpenAI or AsyncOpenAI:
17
+ from deepeval.openai.patch import patch_openai_classes
32
18
 
33
- __all__ = [
34
- "openai",
35
- "OpenAI",
36
- "AsyncOpenAI",
37
- ]
19
+ patch_openai_classes()