deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/execute.py
CHANGED
@@ -51,20 +51,16 @@ from deepeval.utils import (
     shorten,
     len_medium,
     format_error_text,
+    are_timeouts_disabled,
+    get_per_task_timeout_seconds,
+    get_gather_timeout_seconds,
+    get_gather_timeout,
 )
 from deepeval.telemetry import capture_evaluation_run
 from deepeval.metrics import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
     TaskCompletionMetric,
-    # RAG metrics that support both single-turn and multimodal
-    ContextualPrecisionMetric,
-    ContextualRecallMetric,
-    ContextualRelevancyMetric,
-    AnswerRelevancyMetric,
-    FaithfulnessMetric,
-    ToolCorrectnessMetric,
 )
 from deepeval.metrics.indicator import (
     measure_metrics_with_indicator,
@@ -116,14 +112,56 @@ from deepeval.test_run.hyperparameters import (
 
 logger = logging.getLogger(__name__)
 
-
-
-
-
-
-
-
-
+
+def _timeout_msg(action: str, seconds: float) -> str:
+    if are_timeouts_disabled():
+        return (
+            f"Timeout occurred while {action} "
+            "(DeepEval timeouts are disabled; this likely came from the model/provider SDK or network layer). "
+            "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+        )
+    return (
+        f"Timed out after {seconds:.2f}s while {action}. "
+        "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+        "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+    )
+
+
+def _log_gather_timeout(
+    logger,
+    *,
+    exc: Optional[BaseException] = None,
+    pending: Optional[int] = None,
+) -> None:
+    settings = get_settings()
+    if are_timeouts_disabled():
+        logger.warning(
+            "A task raised %s while waiting for gathered results; DeepEval gather/per-task timeouts are disabled%s. "
+            "This likely came from the model/provider SDK or network layer.",
+            type(exc).__name__ if exc else "TimeoutError",
+            f" (pending={pending})" if pending is not None else "",
+            exc_info=settings.DEEPEVAL_LOG_STACK_TRACES,
+        )
+    else:
+        if pending is not None:
+            logger.warning(
+                "Gather TIMEOUT after %.1fs; pending=%d tasks. "
+                "Some metrics may be marked as timed out. "
+                "To give tasks more time, consider increasing "
+                "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or "
+                "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE.",
+                get_gather_timeout_seconds(),
+                pending,
+            )
+
+        else:
+            logger.warning(
+                "gather TIMEOUT after %.1fs. Some metrics may be marked as timed out. "
+                "To give tasks more time, consider increasing "
+                "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or "
+                "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE.",
+                get_gather_timeout_seconds(),
+            )
 
 
 def _skip_metrics_for_error(
@@ -234,18 +272,6 @@ async def _snapshot_tasks():
     return {t for t in asyncio.all_tasks() if t is not cur}
 
 
-def _per_task_timeout() -> float:
-    return get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-
-
-def _gather_timeout() -> float:
-    s = get_settings()
-    return (
-        s.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-        + s.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
-    )
-
-
 def filter_duplicate_results(
     main_result: TestResult, results: List[TestResult]
 ) -> List[TestResult]:
@@ -267,6 +293,10 @@ async def _await_with_outer_deadline(obj, *args, timeout: float, **kwargs):
             coro = obj
         else:
             coro = obj(*args, **kwargs)
+
+        if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+            return await coro
+
         return await asyncio.wait_for(coro, timeout=timeout)
     finally:
         reset_outer_deadline(token)
@@ -282,7 +312,6 @@ def execute_test_cases(
     metrics: Union[
         List[BaseMetric],
         List[BaseConversationalMetric],
-        List[BaseMultimodalMetric],
     ],
     error_config: Optional[ErrorConfig] = ErrorConfig(),
     display_config: Optional[DisplayConfig] = DisplayConfig(),
@@ -315,17 +344,12 @@ def execute_test_cases(
 
     conversational_metrics: List[BaseConversationalMetric] = []
     llm_metrics: List[BaseMetric] = []
-    mllm_metrics: List[BaseMultimodalMetric] = []
     for metric in metrics:
         metric.async_mode = False
         if isinstance(metric, BaseMetric):
             llm_metrics.append(metric)
-            if type(metric) in MLLM_SUPPORTED_METRICS:
-                mllm_metrics.append(metric)
         elif isinstance(metric, BaseConversationalMetric):
             conversational_metrics.append(metric)
-        elif isinstance(metric, BaseMultimodalMetric):
-            mllm_metrics.append(metric)
 
     test_results: List[TestResult] = []
 
@@ -333,23 +357,17 @@ def execute_test_cases(
         progress: Optional[Progress] = None, pbar_id: Optional[int] = None
     ):
         llm_test_case_count = -1
-        mllm_test_case_count = -1
         conversational_test_case_count = -1
         show_metric_indicator = (
            display_config.show_indicator and not _use_bar_indicator
         )
         for i, test_case in enumerate(test_cases):
             # skip what we know we won't run
-            if isinstance(test_case, LLMTestCase)
+            if isinstance(test_case, LLMTestCase):
                 if not llm_metrics:
                     update_pbar(progress, pbar_id)
                     continue
                 per_case_total = len(llm_metrics)
-            elif isinstance(test_case, LLMTestCase) and test_case.multimodal:
-                if not mllm_metrics:
-                    update_pbar(progress, pbar_id)
-                    continue
-                per_case_total = len(mllm_metrics)
             elif isinstance(test_case, ConversationalTestCase):
                 if not conversational_metrics:
                     update_pbar(progress, pbar_id)
@@ -364,56 +382,33 @@ def execute_test_cases(
 
             metrics_for_case = (
                 llm_metrics
-                if (
-
-                    and not test_case.multimodal
-                )
-                else (
-                    mllm_metrics
-                    if (
-                        isinstance(test_case, LLMTestCase)
-                        and test_case.multimodal
-                    )
-                    else conversational_metrics
-                )
+                if (isinstance(test_case, LLMTestCase))
+                else conversational_metrics
             )
             api_test_case = create_api_test_case(
                 test_case=test_case,
                 index=(
                     llm_test_case_count + 1
-                    if (
-
-                        and not test_case.multimodal
-                    )
-                    else (
-                        mllm_test_case_count + 1
-                        if (
-                            isinstance(test_case, LLMTestCase)
-                            and test_case.multimodal
-                        )
-                        else conversational_test_case_count + 1
-                    )
+                    if (isinstance(test_case, LLMTestCase))
+                    else (conversational_test_case_count + 1)
                 ),
             )
             emitted = [False] * len(metrics_for_case)
             index_of = {id(m): i for i, m in enumerate(metrics_for_case)}
             current_index = -1
             start_time = time.perf_counter()
-            deadline_timeout =
+            deadline_timeout = get_per_task_timeout_seconds()
             deadline_token = set_outer_deadline(deadline_timeout)
             new_cached_test_case: CachedTestCase = None
             try:
 
                 def _run_case():
-                    nonlocal new_cached_test_case, current_index, llm_test_case_count,
+                    nonlocal new_cached_test_case, current_index, llm_test_case_count, conversational_test_case_count
                     with capture_evaluation_run("test case"):
                         for metric in metrics:
                             metric.error = None  # Reset metric error
 
-                        if (
-                            isinstance(test_case, LLMTestCase)
-                            and not test_case.multimodal
-                        ):
+                        if isinstance(test_case, LLMTestCase):
                             llm_test_case_count += 1
                             cached_test_case = None
                             if cache_config.use_cache:
@@ -465,29 +460,6 @@ def execute_test_cases(
                                 )
                                 update_pbar(progress, pbar_test_case_id)
 
-                        # No caching and not sending test cases to Confident AI for multimodal metrics yet
-                        elif (
-                            isinstance(test_case, LLMTestCase)
-                            and test_case.multimodal
-                        ):
-                            mllm_test_case_count += 1
-                            for metric in mllm_metrics:
-                                current_index = index_of[id(metric)]
-                                res = _execute_metric(
-                                    metric=metric,
-                                    test_case=test_case,
-                                    show_metric_indicator=show_metric_indicator,
-                                    in_component=False,
-                                    error_config=error_config,
-                                )
-                                if res == "skip":
-                                    continue
-
-                                metric_data = create_metric_data(metric)
-                                api_test_case.update_metric_data(metric_data)
-                                emitted[current_index] = True
-                                update_pbar(progress, pbar_test_case_id)
-
                         # No caching for conversational metrics yet
                         elif isinstance(test_case, ConversationalTestCase):
                             conversational_test_case_count += 1
@@ -510,25 +482,20 @@ def execute_test_cases(
 
                 run_sync_with_timeout(_run_case, deadline_timeout)
             except (asyncio.TimeoutError, TimeoutError):
-
-
-
-
-                )
-                for i, m in enumerate(metrics_for_case):
-                    if getattr(m, "skipped", False):
+
+                msg = _timeout_msg("evaluating metric", deadline_timeout)
+                for i, metric in enumerate(metrics_for_case):
+                    if metric.skipped:
                         continue
                     # already finished or errored? leave it
-                    if
-                        m, "error", None
-                    ):
+                    if metric.success is not None or metric.error is not None:
                         continue
                     if i == current_index:
-
-
+                        metric.success = False
+                        metric.error = msg
                     elif i > current_index:
-
-
+                        metric.success = False
+                        metric.error = "Skipped due to case timeout."
 
                 if not error_config.ignore_errors:
                     raise
@@ -553,12 +520,12 @@ def execute_test_cases(
                 )
 
             # Attach MetricData for *all* metrics (finished or synthesized)
-            for i,
-                if
+            for i, metric in enumerate(metrics_for_case):
+                if metric.skipped:
                     continue
                 if not emitted[i]:
                     api_test_case.update_metric_data(
-                        create_metric_data(
+                        create_metric_data(metric)
                     )
 
             elapsed = time.perf_counter() - start_time
@@ -597,7 +564,6 @@ async def a_execute_test_cases(
     metrics: Union[
         List[BaseMetric],
         List[BaseConversationalMetric],
-        List[BaseMultimodalMetric],
     ],
     error_config: Optional[ErrorConfig] = ErrorConfig(),
    display_config: Optional[DisplayConfig] = DisplayConfig(),
@@ -612,9 +578,8 @@ async def a_execute_test_cases(
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-            timeout = _per_task_timeout()
             return await _await_with_outer_deadline(
-                func, *args, timeout=
+                func, *args, timeout=get_per_task_timeout_seconds(), **kwargs
             )
 
     global_test_run_cache_manager.disable_write_cache = (
@@ -631,20 +596,14 @@ async def a_execute_test_cases(
             metric.verbose_mode = display_config.verbose_mode
 
     llm_metrics: List[BaseMetric] = []
-    mllm_metrics: List[BaseMultimodalMetric] = []
    conversational_metrics: List[BaseConversationalMetric] = []
    for metric in metrics:
        if isinstance(metric, BaseMetric):
            llm_metrics.append(metric)
-            if type(metric) in MLLM_SUPPORTED_METRICS:
-                mllm_metrics.append(metric)
-        elif isinstance(metric, BaseMultimodalMetric):
-            mllm_metrics.append(metric)
        elif isinstance(metric, BaseConversationalMetric):
            conversational_metrics.append(metric)
 
    llm_test_case_counter = -1
-    mllm_test_case_counter = -1
    conversational_test_case_counter = -1
    test_results: List[Union[TestResult, LLMTestCase]] = []
    tasks = []
@@ -665,10 +624,7 @@ async def a_execute_test_cases(
         with progress:
             for test_case in test_cases:
                 with capture_evaluation_run("test case"):
-                    if (
-                        isinstance(test_case, LLMTestCase)
-                        and not test_case.multimodal
-                    ):
+                    if isinstance(test_case, LLMTestCase):
                         if len(llm_metrics) == 0:
                             update_pbar(progress, pbar_id)
                             continue
@@ -696,31 +652,6 @@ async def a_execute_test_cases(
                         )
                         tasks.append(asyncio.create_task(task))
 
-                    elif (
-                        isinstance(test_case, LLMTestCase)
-                        and test_case.multimodal
-                    ):
-                        mllm_test_case_counter += 1
-                        copied_multimodal_metrics: List[
-                            BaseMultimodalMetric
-                        ] = copy_metrics(mllm_metrics)
-                        task = execute_with_semaphore(
-                            func=_a_execute_mllm_test_cases,
-                            metrics=copied_multimodal_metrics,
-                            test_case=test_case,
-                            test_run_manager=test_run_manager,
-                            test_results=test_results,
-                            count=mllm_test_case_counter,
-                            ignore_errors=error_config.ignore_errors,
-                            skip_on_missing_params=error_config.skip_on_missing_params,
-                            show_indicator=display_config.show_indicator,
-                            _use_bar_indicator=_use_bar_indicator,
-                            _is_assert_test=_is_assert_test,
-                            progress=progress,
-                            pbar_id=pbar_id,
-                        )
-                        tasks.append(asyncio.create_task(task))
-
                     elif isinstance(test_case, ConversationalTestCase):
                         conversational_test_case_counter += 1
 
@@ -746,27 +677,23 @@ async def a_execute_test_cases(
         try:
             await asyncio.wait_for(
                 asyncio.gather(*tasks),
-                timeout=
+                timeout=get_gather_timeout(),
             )
-        except (asyncio.TimeoutError, TimeoutError):
+        except (asyncio.TimeoutError, TimeoutError) as e:
             for t in tasks:
                 if not t.done():
                     t.cancel()
             await asyncio.gather(*tasks, return_exceptions=True)
-
-
-
-            )
+
+            _log_gather_timeout(logger, exc=e)
+
             if not error_config.ignore_errors:
                 raise
 
    else:
        for test_case in test_cases:
            with capture_evaluation_run("test case"):
-                if (
-                    isinstance(test_case, LLMTestCase)
-                    and not test_case.multimodal
-                ):
+                if isinstance(test_case, LLMTestCase):
                    if len(llm_metrics) == 0:
                        continue
                    llm_test_case_counter += 1
@@ -814,34 +741,12 @@ async def a_execute_test_cases(
                    )
                    tasks.append(asyncio.create_task((task)))
 
-                elif (
-                    isinstance(test_case, LLMTestCase) and test_case.multimodal
-                ):
-                    mllm_test_case_counter += 1
-                    copied_multimodal_metrics: List[BaseMultimodalMetric] = (
-                        copy_metrics(mllm_metrics)
-                    )
-                    task = execute_with_semaphore(
-                        func=_a_execute_mllm_test_cases,
-                        metrics=copied_multimodal_metrics,
-                        test_case=test_case,
-                        test_run_manager=test_run_manager,
-                        test_results=test_results,
-                        count=mllm_test_case_counter,
-                        ignore_errors=error_config.ignore_errors,
-                        skip_on_missing_params=error_config.skip_on_missing_params,
-                        _use_bar_indicator=_use_bar_indicator,
-                        _is_assert_test=_is_assert_test,
-                        show_indicator=display_config.show_indicator,
-                    )
-                    tasks.append(asyncio.create_task(task))
-
                await asyncio.sleep(async_config.throttle_value)
 
        try:
            await asyncio.wait_for(
                asyncio.gather(*tasks),
-                timeout=
+                timeout=get_gather_timeout(),
            )
        except (asyncio.TimeoutError, TimeoutError):
            # Cancel any still-pending tasks and drain them
@@ -910,11 +815,18 @@ async def _a_execute_llm_test_cases(
            progress=progress,
        )
    except asyncio.CancelledError:
-
-
-
-
-
+        if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+            msg = (
+                "Cancelled while evaluating metric. "
+                "(DeepEval timeouts are disabled; this cancellation likely came from upstream orchestration or manual cancellation). "
+                "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            )
+        else:
+            msg = (
+                "Timed out/cancelled while evaluating metric. "
+                "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+                "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            )
        for m in metrics:
            if getattr(m, "skipped", False):
                continue
@@ -976,85 +888,8 @@ async def _a_execute_llm_test_cases(
        update_pbar(progress, pbar_id)
 
 
-async def _a_execute_mllm_test_cases(
-    metrics: List[BaseMultimodalMetric],
-    test_case: LLMTestCase,
-    test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, LLMTestCase]],
-    count: int,
-    ignore_errors: bool,
-    skip_on_missing_params: bool,
-    show_indicator: bool,
-    _use_bar_indicator: bool,
-    _is_assert_test: bool,
-    progress: Optional[Progress] = None,
-    pbar_id: Optional[int] = None,
-):
-    show_metrics_indicator = show_indicator and not _use_bar_indicator
-    pbar_test_case_id = add_pbar(
-        progress,
-        f" 🎯 Evaluating test case #{count}",
-        total=len(metrics),
-    )
-
-    for metric in metrics:
-        metric.skipped = False
-        metric.error = None  # Reset metric error
-
-    api_test_case: LLMApiTestCase = create_api_test_case(
-        test_case=test_case, index=count if not _is_assert_test else None
-    )
-    test_start_time = time.perf_counter()
-    try:
-        await measure_metrics_with_indicator(
-            metrics=metrics,
-            test_case=test_case,
-            cached_test_case=None,
-            skip_on_missing_params=skip_on_missing_params,
-            ignore_errors=ignore_errors,
-            show_indicator=show_metrics_indicator,
-            pbar_eval_id=pbar_test_case_id,
-            progress=progress,
-        )
-    except asyncio.CancelledError:
-        msg = (
-            "Timed out/cancelled while evaluating metric. "
-            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
-            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
-        )
-        for m in metrics:
-            if getattr(m, "skipped", False):
-                continue
-            # If the task never finished and didn't set a terminal state, mark it now
-            if getattr(m, "success", None) is None and not getattr(
-                m, "error", None
-            ):
-                m.success = False
-                m.error = msg
-        if not ignore_errors:
-            raise
-    finally:
-        for metric in metrics:
-            if metric.skipped:
-                continue
-
-            metric_data = create_metric_data(metric)
-            api_test_case.update_metric_data(metric_data)
-
-        test_end_time = time.perf_counter()
-        run_duration = test_end_time - test_start_time
-        api_test_case.update_run_duration(run_duration)
-
-        ### Update Test Run ###
-        test_run_manager.update_test_run(api_test_case, test_case)
-        test_results.append(create_test_result(api_test_case))
-        update_pbar(progress, pbar_id)
-
-
 async def _a_execute_conversational_test_cases(
-    metrics: List[
-        Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric]
-    ],
+    metrics: List[Union[BaseMetric, BaseConversationalMetric]],
    test_case: ConversationalTestCase,
    test_run_manager: TestRunManager,
    test_results: List[Union[TestResult, LLMTestCase]],
@@ -1097,11 +932,18 @@ async def _a_execute_conversational_test_cases(
        )
 
    except asyncio.CancelledError:
-
-
-
-
-
+        if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+            msg = (
+                "Cancelled while evaluating metric. "
+                "(DeepEval timeouts are disabled; this cancellation likely came from upstream orchestration or manual cancellation). "
+                "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            )
+        else:
+            msg = (
+                "Timed out/cancelled while evaluating metric. "
+                "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+                "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            )
        for m in metrics:
            if getattr(m, "skipped", False):
                continue
@@ -1211,7 +1053,7 @@ def execute_agentic_test_cases(
            loop.run_until_complete(
                _await_with_outer_deadline(
                    coro,
-                    timeout=
+                    timeout=get_per_task_timeout_seconds(),
                )
            )
        else:
@@ -1538,17 +1380,13 @@ def execute_agentic_test_cases(
 
        # run the golden with a timeout
        start_time = time.perf_counter()
-        deadline =
+        deadline = get_per_task_timeout_seconds()
 
        try:
            run_sync_with_timeout(_run_golden, deadline)
        except (asyncio.TimeoutError, TimeoutError):
            # mark any not yet finished trace level and span level metrics as timed out.
-            msg = (
-                f"Timed out after {deadline:.2f}s while executing agentic test case. "
-                "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
-                "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
-            )
+            msg = _timeout_msg("executing agentic test case", deadline)
 
            if current_trace is not None:
                # Trace-level metrics
@@ -1729,9 +1567,8 @@ async def a_execute_agentic_test_cases(
 
    async def execute_with_semaphore(func: Callable, *args, **kwargs):
        async with semaphore:
-            timeout = _per_task_timeout()
            return await _await_with_outer_deadline(
-                func, *args, timeout=
+                func, *args, timeout=get_per_task_timeout_seconds(), **kwargs
            )
 
    test_run_manager = global_test_run_manager
@@ -1782,7 +1619,7 @@ async def a_execute_agentic_test_cases(
    try:
        await asyncio.wait_for(
            asyncio.gather(*tasks),
-            timeout=
+            timeout=get_gather_timeout(),
        )
    except (asyncio.TimeoutError, TimeoutError):
        # Cancel any still-pending tasks and drain them
@@ -1863,7 +1700,7 @@ async def _a_execute_agentic_test_case(
                await _await_with_outer_deadline(
                    observed_callback,
                    golden.input,
-                    timeout=
+                    timeout=get_per_task_timeout_seconds(),
                )
            else:
                observed_callback(golden.input)
@@ -1957,7 +1794,7 @@ async def _a_execute_agentic_test_case(
        try:
            await asyncio.wait_for(
                asyncio.gather(*child_tasks),
-                timeout=
+                timeout=get_gather_timeout(),
            )
        except (asyncio.TimeoutError, TimeoutError):
            for t in child_tasks:
@@ -1980,11 +1817,18 @@ async def _a_execute_agentic_test_case(
        )
    except asyncio.CancelledError:
        # mark any unfinished metrics as cancelled
-
-
-
-
-
+        if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+            cancel_msg = (
+                "Cancelled while evaluating agentic test case. "
+                "(DeepEval timeouts are disabled; this cancellation likely came from upstream orchestration or manual cancellation). "
+                "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            )
+        else:
+            cancel_msg = (
+                "Timed out/cancelled while evaluating agentic test case. "
+                "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+                "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            )
 
        if trace_metrics:
            for m in trace_metrics:
@@ -2676,8 +2520,9 @@ def a_execute_agentic_test_cases_from_loop(
 
    async def execute_callback_with_semaphore(coroutine: Awaitable):
        async with semaphore:
-
-
+            return await _await_with_outer_deadline(
+                coroutine, timeout=get_per_task_timeout_seconds()
+            )
 
    def evaluate_test_cases(
        progress: Optional[Progress] = None,
@@ -2899,15 +2744,18 @@ def a_execute_agentic_test_cases_from_loop(
            loop.run_until_complete(
                asyncio.wait_for(
                    asyncio.gather(*created_tasks, return_exceptions=True),
-                    timeout=
+                    timeout=get_gather_timeout(),
                )
            )
 
-        except (asyncio.TimeoutError, TimeoutError):
+        except (asyncio.TimeoutError, TimeoutError) as e:
            import traceback
 
+            settings = get_settings()
            pending = [t for t in created_tasks if not t.done()]
 
+            _log_gather_timeout(logger, exc=e, pending=len(pending))
+
            # Log the elapsed time for each task that was pending
            for t in pending:
                meta = task_meta.get(t, {})
@@ -2915,26 +2763,27 @@ def a_execute_agentic_test_cases_from_loop(
                elapsed_time = time.perf_counter() - start_time
 
                # Determine if it was a per task or gather timeout based on task's elapsed time
-                if
-                timeout_type =
+                if not settings.DEEPEVAL_DISABLE_TIMEOUTS:
+                    timeout_type = (
+                        "per-task"
+                        if elapsed_time >= get_per_task_timeout_seconds()
+                        else "gather"
+                    )
+                    logger.info(
+                        " - PENDING %s elapsed_time=%.2fs timeout_type=%s meta=%s",
+                        t.get_name(),
+                        elapsed_time,
+                        timeout_type,
+                        meta,
+                    )
                else:
-
-
-
-
-
-
-                    f"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS for longer task completion time or "
-                    f"DEEPEVAL_TASK_GATHER_BUFFER_SECONDS to allow more time for gathering results."
-                )
+                    logger.info(
+                        " - PENDING %s elapsed_time=%.2fs meta=%s",
+                        t.get_name(),
+                        elapsed_time,
+                        meta,
+                    )
 
-                # Log pending tasks and their stack traces
-                logger.info(
-                    " - PENDING %s elapsed_time=%.2fs meta=%s",
-                    t.get_name(),
-                    elapsed_time,
-                    meta,
-                )
                if loop.get_debug() and get_settings().DEEPEVAL_DEBUG_ASYNC:
                    frames = t.get_stack(limit=6)
                    if frames:
@@ -3116,9 +2965,8 @@ async def _a_evaluate_traces(
 
    async def execute_evals_with_semaphore(func: Callable, *args, **kwargs):
        async with semaphore:
-            timeout = _per_task_timeout()
            return await _await_with_outer_deadline(
-                func, *args, timeout=
+                func, *args, timeout=get_per_task_timeout_seconds(), **kwargs
            )
 
    eval_tasks = []
@@ -3166,7 +3014,7 @@ async def _a_evaluate_traces(
    try:
        await asyncio.wait_for(
            asyncio.gather(*eval_tasks),
-            timeout=
+            timeout=get_gather_timeout(),
        )
    except (asyncio.TimeoutError, TimeoutError):
        for t in eval_tasks:
@@ -3196,9 +3044,8 @@ async def _evaluate_test_case_pairs(
 
    async def execute_with_semaphore(func: Callable, *args, **kwargs):
        async with semaphore:
-            timeout = _per_task_timeout()
            return await _await_with_outer_deadline(
-                func, *args, timeout=
+                func, *args, timeout=get_per_task_timeout_seconds(), **kwargs
            )
 
    tasks = []
@@ -3236,7 +3083,7 @@ async def _evaluate_test_case_pairs(
    try:
        await asyncio.wait_for(
            asyncio.gather(*tasks),
-            timeout=
+            timeout=get_gather_timeout(),
        )
    except (asyncio.TimeoutError, TimeoutError):
        # Cancel any still-pending tasks and drain them