deepeval-3.6.8-py3-none-any.whl → deepeval-3.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. deepeval/_version.py +1 -1
  2. deepeval/anthropic/__init__.py +19 -0
  3. deepeval/anthropic/extractors.py +94 -0
  4. deepeval/anthropic/patch.py +169 -0
  5. deepeval/anthropic/utils.py +225 -0
  6. deepeval/benchmarks/drop/drop.py +40 -14
  7. deepeval/benchmarks/ifeval/ifeval.py +2 -2
  8. deepeval/confident/types.py +4 -2
  9. deepeval/config/settings.py +258 -47
  10. deepeval/config/settings_manager.py +4 -0
  11. deepeval/config/utils.py +5 -0
  12. deepeval/dataset/dataset.py +162 -30
  13. deepeval/dataset/utils.py +41 -13
  14. deepeval/evaluate/execute.py +1099 -633
  15. deepeval/integrations/crewai/handler.py +36 -0
  16. deepeval/integrations/langchain/callback.py +27 -2
  17. deepeval/integrations/llama_index/handler.py +58 -4
  18. deepeval/integrations/llama_index/utils.py +24 -0
  19. deepeval/metrics/__init__.py +5 -0
  20. deepeval/metrics/exact_match/__init__.py +0 -0
  21. deepeval/metrics/exact_match/exact_match.py +94 -0
  22. deepeval/metrics/indicator.py +21 -1
  23. deepeval/metrics/pattern_match/__init__.py +0 -0
  24. deepeval/metrics/pattern_match/pattern_match.py +103 -0
  25. deepeval/metrics/task_completion/task_completion.py +9 -2
  26. deepeval/model_integrations/__init__.py +0 -0
  27. deepeval/model_integrations/utils.py +116 -0
  28. deepeval/models/base_model.py +3 -1
  29. deepeval/models/llms/amazon_bedrock_model.py +20 -17
  30. deepeval/models/llms/openai_model.py +10 -1
  31. deepeval/models/retry_policy.py +103 -20
  32. deepeval/openai/__init__.py +3 -1
  33. deepeval/openai/extractors.py +2 -2
  34. deepeval/openai/utils.py +7 -31
  35. deepeval/prompt/api.py +11 -10
  36. deepeval/prompt/prompt.py +5 -4
  37. deepeval/simulator/conversation_simulator.py +25 -18
  38. deepeval/synthesizer/chunking/context_generator.py +9 -1
  39. deepeval/telemetry.py +3 -3
  40. deepeval/test_case/llm_test_case.py +3 -2
  41. deepeval/test_run/api.py +3 -2
  42. deepeval/test_run/cache.py +4 -3
  43. deepeval/test_run/test_run.py +24 -5
  44. deepeval/tracing/api.py +11 -10
  45. deepeval/tracing/otel/exporter.py +11 -0
  46. deepeval/tracing/patchers.py +102 -1
  47. deepeval/tracing/trace_context.py +13 -4
  48. deepeval/tracing/tracing.py +10 -1
  49. deepeval/tracing/types.py +8 -8
  50. deepeval/tracing/utils.py +9 -0
  51. deepeval/utils.py +44 -2
  52. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/METADATA +2 -2
  53. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/RECORD +57 -47
  54. /deepeval/{openai → model_integrations}/types.py +0 -0
  55. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/LICENSE.md +0 -0
  56. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/WHEEL +0 -0
  57. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/entry_points.txt +0 -0
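Most of the churn in this release sits in deepeval/evaluate/execute.py, which reworks test-case execution around per-task timeouts: an "outer deadline" is set before each case runs (via set_outer_deadline / reset_outer_deadline / run_sync_with_timeout imported from deepeval.models.retry_policy), coroutines are awaited under that deadline, and expired cases surface as (asyncio.TimeoutError, TimeoutError) instead of hanging. The sketch below only illustrates that pattern; the deadline bookkeeping here is a hypothetical stand-in built on contextvars, not deepeval's actual retry_policy implementation.

# Illustrative sketch of the outer-deadline pattern visible in the execute.py
# hunks below. set_outer_deadline / reset_outer_deadline are hypothetical
# stand-ins for the helpers that 3.7.0 imports from deepeval.models.retry_policy.
import asyncio
import contextvars
import inspect
import time

_outer_deadline: contextvars.ContextVar = contextvars.ContextVar(
    "outer_deadline", default=None
)


def set_outer_deadline(timeout: float):
    # Record an absolute deadline so nested retry/backoff logic could respect it.
    return _outer_deadline.set(time.monotonic() + timeout)


def reset_outer_deadline(token) -> None:
    # Restore whatever deadline (if any) was active before this case started.
    _outer_deadline.reset(token)


async def await_with_outer_deadline(obj, *args, timeout: float, **kwargs):
    # Await a coroutine, or call a coroutine function, under a shared deadline.
    token = set_outer_deadline(timeout)
    try:
        coro = obj if inspect.isawaitable(obj) else obj(*args, **kwargs)
        return await asyncio.wait_for(coro, timeout=timeout)
    finally:
        reset_outer_deadline(token)


async def _slow_metric(delay: float) -> str:
    await asyncio.sleep(delay)
    return "done"


if __name__ == "__main__":
    # Finishes well inside the 1s budget.
    print(asyncio.run(await_with_outer_deadline(_slow_metric, 0.1, timeout=1.0)))
    # Exceeds the budget; callers in the new execute.py code paths catch this
    # as (asyncio.TimeoutError, TimeoutError) and mark the metrics as timed out.
    try:
        asyncio.run(await_with_outer_deadline(_slow_metric, 2.0, timeout=0.5))
    except (asyncio.TimeoutError, TimeoutError):
        print("timed out")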
@@ -1,3 +1,4 @@
+import inspect
 import logging
 
 from rich.progress import (
@@ -56,10 +57,16 @@ from deepeval.metrics import (
     BaseMetric,
     BaseConversationalMetric,
     BaseMultimodalMetric,
+    TaskCompletionMetric,
 )
 from deepeval.metrics.indicator import (
     measure_metrics_with_indicator,
 )
+from deepeval.models.retry_policy import (
+    set_outer_deadline,
+    reset_outer_deadline,
+    run_sync_with_timeout,
+)
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
@@ -238,6 +245,18 @@ def filter_duplicate_results(
     ]
 
 
+async def _await_with_outer_deadline(obj, *args, timeout: float, **kwargs):
+    token = set_outer_deadline(timeout)
+    try:
+        if inspect.isawaitable(obj):
+            coro = obj
+        else:
+            coro = obj(*args, **kwargs)
+        return await asyncio.wait_for(coro, timeout=timeout)
+    finally:
+        reset_outer_deadline(token)
+
+
 ###########################################
 ### E2E Evals #############################
 ###########################################
@@ -269,6 +288,13 @@ def execute_test_cases(
 
     test_run_manager.save_to_disk = cache_config.write_cache
     test_run = test_run_manager.get_test_run(identifier=identifier)
+    if test_run is None:
+        # ensure we have a test_run ( in case it couldn't be loaded from disk )
+        test_run_manager.create_test_run(identifier=identifier)
+        test_run = test_run_manager.get_test_run(identifier=identifier)
+
+    # capture once for inner closures
+    hyperparameters = test_run.hyperparameters if test_run is not None else None
 
     if display_config.verbose_mode is not None:
         for metric in metrics:
@@ -289,176 +315,228 @@ def execute_test_cases(
289
315
  test_results: List[TestResult] = []
290
316
 
291
317
  def evaluate_test_cases(
292
- progress: Optional[Progress] = None, pbar_id: Optional[str] = None
318
+ progress: Optional[Progress] = None, pbar_id: Optional[int] = None
293
319
  ):
294
320
  llm_test_case_count = -1
321
+ mllm_test_case_count = -1
295
322
  conversational_test_case_count = -1
296
323
  show_metric_indicator = (
297
324
  display_config.show_indicator and not _use_bar_indicator
298
325
  )
299
326
  for i, test_case in enumerate(test_cases):
327
+ # skip what we know we won't run
328
+ if isinstance(test_case, LLMTestCase):
329
+ if not llm_metrics:
330
+ update_pbar(progress, pbar_id)
331
+ continue
332
+ per_case_total = len(llm_metrics)
333
+ elif isinstance(test_case, MLLMTestCase):
334
+ if not mllm_metrics:
335
+ update_pbar(progress, pbar_id)
336
+ continue
337
+ per_case_total = len(mllm_metrics)
338
+ elif isinstance(test_case, ConversationalTestCase):
339
+ if not conversational_metrics:
340
+ update_pbar(progress, pbar_id)
341
+ continue
342
+ per_case_total = len(conversational_metrics)
343
+
300
344
  pbar_test_case_id = add_pbar(
301
345
  progress,
302
346
  f" 🎯 Evaluating test case #{i}",
303
- total=len(metrics),
347
+ total=per_case_total,
304
348
  )
305
- with capture_evaluation_run("test case"):
306
- for metric in metrics:
307
- metric.error = None # Reset metric error
308
349
 
309
- if isinstance(test_case, LLMTestCase):
310
- if len(llm_metrics) == 0:
311
- continue
312
-
313
- llm_test_case_count += 1
314
- cached_test_case = None
315
- if cache_config.use_cache:
316
- cached_test_case = (
317
- global_test_run_cache_manager.get_cached_test_case(
318
- test_case, test_run.hyperparameters
319
- )
320
- )
321
-
322
- ##### Metric Calculation #####
323
- api_test_case: LLMApiTestCase = create_api_test_case(
324
- test_case=test_case, index=llm_test_case_count
350
+ metrics_for_case = (
351
+ llm_metrics
352
+ if isinstance(test_case, LLMTestCase)
353
+ else (
354
+ mllm_metrics
355
+ if isinstance(test_case, MLLMTestCase)
356
+ else conversational_metrics
357
+ )
358
+ )
359
+ api_test_case = create_api_test_case(
360
+ test_case=test_case,
361
+ index=(
362
+ llm_test_case_count + 1
363
+ if isinstance(test_case, LLMTestCase)
364
+ else (
365
+ mllm_test_case_count + 1
366
+ if isinstance(test_case, MLLMTestCase)
367
+ else conversational_test_case_count + 1
325
368
  )
326
- new_cached_test_case: CachedTestCase = CachedTestCase()
327
-
328
- test_start_time = time.perf_counter()
329
- read_all_metrics_from_cache = True
330
- for metric in llm_metrics:
331
- metric_data = None
332
- if cached_test_case is not None:
333
- cached_metric_data = Cache.get_metric_data(
334
- metric, cached_test_case
335
- )
336
- if cached_metric_data:
337
- metric_data = cached_metric_data.metric_data
338
-
339
- if metric_data is None:
340
- read_all_metrics_from_cache = False
341
- res = _execute_metric(
342
- metric=metric,
343
- test_case=test_case,
344
- show_metric_indicator=show_metric_indicator,
345
- in_component=False,
346
- error_config=error_config,
347
- )
348
- if res == "skip":
349
- continue
350
- metric_data = create_metric_data(metric)
351
-
352
- # here, we will check for an additional property on the flattened test cases to see if updating is necessary
353
- api_test_case.update_metric_data(metric_data)
354
- if metric.error is None:
355
- cache_metric_data = deepcopy(metric_data)
356
- cache_metric_data.evaluation_cost = 0 # Cached metrics will have evaluation cost as 0, not None.
357
- updated_cached_metric_data = CachedMetricData(
358
- metric_data=cache_metric_data,
359
- metric_configuration=Cache.create_metric_configuration(
360
- metric
361
- ),
362
- )
363
- new_cached_test_case.cached_metrics_data.append(
364
- updated_cached_metric_data
365
- )
366
- update_pbar(progress, pbar_test_case_id)
369
+ ),
370
+ )
371
+ emitted = [False] * len(metrics_for_case)
372
+ index_of = {id(m): i for i, m in enumerate(metrics_for_case)}
373
+ current_index = -1
374
+ start_time = time.perf_counter()
375
+ deadline_timeout = _per_task_timeout()
376
+ deadline_token = set_outer_deadline(deadline_timeout)
377
+ new_cached_test_case: CachedTestCase = None
378
+ try:
367
379
 
368
- test_end_time = time.perf_counter()
369
- if read_all_metrics_from_cache:
370
- run_duration = 0
371
- else:
372
- run_duration = test_end_time - test_start_time
373
- api_test_case.update_run_duration(run_duration)
380
+ def _run_case():
381
+ nonlocal new_cached_test_case, current_index, llm_test_case_count, mllm_test_case_count, conversational_test_case_count
382
+ with capture_evaluation_run("test case"):
383
+ for metric in metrics:
384
+ metric.error = None # Reset metric error
385
+
386
+ if isinstance(test_case, LLMTestCase):
387
+ llm_test_case_count += 1
388
+ cached_test_case = None
389
+ if cache_config.use_cache:
390
+ cached_test_case = global_test_run_cache_manager.get_cached_test_case(
391
+ test_case, hyperparameters
392
+ )
374
393
 
375
- ### Update Test Run ###
376
- test_run_manager.update_test_run(api_test_case, test_case)
394
+ ##### Metric Calculation #####
395
+ new_cached_test_case = CachedTestCase()
377
396
 
378
- ### Cache Test Run ###
379
- global_test_run_cache_manager.cache_test_case(
380
- test_case,
381
- new_cached_test_case,
382
- test_run.hyperparameters,
383
- )
384
- global_test_run_cache_manager.cache_test_case(
385
- test_case,
386
- new_cached_test_case,
387
- test_run.hyperparameters,
388
- to_temp=True,
389
- )
397
+ for metric in llm_metrics:
398
+ current_index = index_of[id(metric)]
399
+ metric_data = None
400
+ if cached_test_case is not None:
401
+ cached_metric_data = Cache.get_metric_data(
402
+ metric, cached_test_case
403
+ )
404
+ if cached_metric_data:
405
+ metric_data = (
406
+ cached_metric_data.metric_data
407
+ )
390
408
 
391
- # No caching and not sending test cases to Confident AI for multimodal metrics yet
392
- elif isinstance(test_case, MLLMTestCase):
393
- if len(mllm_metrics) == 0:
394
- continue
409
+ if metric_data is None:
410
+ res = _execute_metric(
411
+ metric=metric,
412
+ test_case=test_case,
413
+ show_metric_indicator=show_metric_indicator,
414
+ in_component=False,
415
+ error_config=error_config,
416
+ )
417
+ if res == "skip":
418
+ continue
419
+ metric_data = create_metric_data(metric)
395
420
 
396
- api_test_case: LLMApiTestCase = create_api_test_case(
397
- test_case=test_case, index=llm_test_case_count
398
- )
399
- test_start_time = time.perf_counter()
400
- for metric in mllm_metrics:
401
- res = _execute_metric(
402
- metric=metric,
403
- test_case=test_case,
404
- show_metric_indicator=show_metric_indicator,
405
- in_component=False,
406
- error_config=error_config,
407
- )
408
- if res == "skip":
409
- continue
421
+ # here, we will check for an additional property on the flattened test cases to see if updating is necessary
422
+ api_test_case.update_metric_data(metric_data)
423
+ emitted[current_index] = True
424
+ if metric.error is None:
425
+ cache_metric_data = deepcopy(metric_data)
426
+ cache_metric_data.evaluation_cost = 0 # Cached metrics will have evaluation cost as 0, not None.
427
+ updated_cached_metric_data = CachedMetricData(
428
+ metric_data=cache_metric_data,
429
+ metric_configuration=Cache.create_metric_configuration(
430
+ metric
431
+ ),
432
+ )
433
+ new_cached_test_case.cached_metrics_data.append(
434
+ updated_cached_metric_data
435
+ )
436
+ update_pbar(progress, pbar_test_case_id)
410
437
 
411
- metric_data = create_metric_data(metric)
412
- api_test_case.update_metric_data(metric_data)
413
- update_pbar(progress, pbar_test_case_id)
438
+ # No caching and not sending test cases to Confident AI for multimodal metrics yet
439
+ elif isinstance(test_case, MLLMTestCase):
440
+ mllm_test_case_count += 1
441
+ for metric in mllm_metrics:
442
+ current_index = index_of[id(metric)]
443
+ res = _execute_metric(
444
+ metric=metric,
445
+ test_case=test_case,
446
+ show_metric_indicator=show_metric_indicator,
447
+ in_component=False,
448
+ error_config=error_config,
449
+ )
450
+ if res == "skip":
451
+ continue
414
452
 
415
- test_end_time = time.perf_counter()
416
- if len(mllm_metrics) > 0:
417
- run_duration = test_end_time - test_start_time
418
- api_test_case.update_run_duration(run_duration)
453
+ metric_data = create_metric_data(metric)
454
+ api_test_case.update_metric_data(metric_data)
455
+ emitted[current_index] = True
456
+ update_pbar(progress, pbar_test_case_id)
419
457
 
420
- ### Update Test Run ###
421
- test_run_manager.update_test_run(api_test_case, test_case)
458
+ # No caching for conversational metrics yet
459
+ elif isinstance(test_case, ConversationalTestCase):
460
+ conversational_test_case_count += 1
461
+ for metric in conversational_metrics:
462
+ current_index = index_of[id(metric)]
463
+ res = _execute_metric(
464
+ metric=metric,
465
+ test_case=test_case,
466
+ show_metric_indicator=show_metric_indicator,
467
+ in_component=False,
468
+ error_config=error_config,
469
+ )
470
+ if res == "skip":
471
+ continue
422
472
 
423
- # No caching for conversational metrics yet
424
- elif isinstance(test_case, ConversationalTestCase):
425
- if len(metrics) == 0:
473
+ metric_data = create_metric_data(metric)
474
+ api_test_case.update_metric_data(metric_data)
475
+ emitted[current_index] = True
476
+ update_pbar(progress, pbar_test_case_id)
477
+
478
+ run_sync_with_timeout(_run_case, deadline_timeout)
479
+ except (asyncio.TimeoutError, TimeoutError):
480
+ msg = (
481
+ f"Timed out after {deadline_timeout:.2f}s while evaluating metric. "
482
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
483
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
484
+ )
485
+ for i, m in enumerate(metrics_for_case):
486
+ if getattr(m, "skipped", False):
487
+ continue
488
+ # already finished or errored? leave it
489
+ if getattr(m, "success", None) is not None or getattr(
490
+ m, "error", None
491
+ ):
426
492
  continue
493
+ if i == current_index:
494
+ m.success = False
495
+ m.error = msg
496
+ elif i > current_index:
497
+ m.success = False
498
+ m.error = "Skipped due to case timeout."
499
+
500
+ if not error_config.ignore_errors:
501
+ raise
427
502
 
428
- conversational_test_case_count += 1
429
- api_test_case: ConversationalApiTestCase = (
430
- create_api_test_case(
431
- test_case=test_case,
432
- index=conversational_test_case_count,
503
+ finally:
504
+ try:
505
+ if (
506
+ isinstance(test_case, LLMTestCase)
507
+ and new_cached_test_case is not None
508
+ ):
509
+ ### Cache Test Run ###
510
+ global_test_run_cache_manager.cache_test_case(
511
+ test_case,
512
+ new_cached_test_case,
513
+ hyperparameters,
433
514
  )
434
- )
435
-
436
- test_start_time = time.perf_counter()
437
- for metric in metrics:
438
- res = _execute_metric(
439
- metric=metric,
440
- test_case=test_case,
441
- show_metric_indicator=show_metric_indicator,
442
- in_component=False,
443
- error_config=error_config,
515
+ global_test_run_cache_manager.cache_test_case(
516
+ test_case,
517
+ new_cached_test_case,
518
+ hyperparameters,
519
+ to_temp=True,
444
520
  )
445
- if res == "skip":
446
- continue
447
-
448
- metric_data = create_metric_data(metric)
449
- api_test_case.update_metric_data(metric_data)
450
- update_pbar(progress, pbar_test_case_id)
451
521
 
452
- test_end_time = time.perf_counter()
453
- run_duration = test_end_time - test_start_time
454
- api_test_case.update_run_duration(run_duration)
522
+ # Attach MetricData for *all* metrics (finished or synthesized)
523
+ for i, m in enumerate(metrics_for_case):
524
+ if getattr(m, "skipped", False):
525
+ continue
526
+ if not emitted[i]:
527
+ api_test_case.update_metric_data(
528
+ create_metric_data(m)
529
+ )
455
530
 
456
- ### Update Test Run ###
531
+ elapsed = time.perf_counter() - start_time
532
+ api_test_case.update_run_duration(
533
+ elapsed if elapsed >= 0 else deadline_timeout
534
+ )
457
535
  test_run_manager.update_test_run(api_test_case, test_case)
458
-
459
- test_result = create_test_result(api_test_case)
460
- test_results.append(test_result)
461
- update_pbar(progress, pbar_id)
536
+ test_results.append(create_test_result(api_test_case))
537
+ update_pbar(progress, pbar_id)
538
+ finally:
539
+ reset_outer_deadline(deadline_token)
462
540
 
463
541
  if display_config.show_indicator and _use_bar_indicator:
464
542
  progress = Progress(
@@ -503,9 +581,9 @@ async def a_execute_test_cases(
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-            return await asyncio.wait_for(
-                func(*args, **kwargs),
-                timeout=_per_task_timeout(),
+            timeout = _per_task_timeout()
+            return await _await_with_outer_deadline(
+                func, *args, timeout=timeout, **kwargs
             )
 
     global_test_run_cache_manager.disable_write_cache = (
@@ -609,7 +687,7 @@ async def a_execute_test_cases(
 
                         task = execute_with_semaphore(
                             func=_a_execute_conversational_test_cases,
-                            metrics=copy_metrics(metrics),
+                            metrics=copy_metrics(conversational_metrics),
                             test_case=test_case,
                             test_run_manager=test_run_manager,
                             test_results=test_results,
@@ -631,13 +709,15 @@ async def a_execute_test_cases(
                     asyncio.gather(*tasks),
                     timeout=_gather_timeout(),
                 )
-            except asyncio.TimeoutError:
-                # Cancel any still-pending tasks and drain them
+            except (asyncio.TimeoutError, TimeoutError):
                 for t in tasks:
                     if not t.done():
                         t.cancel()
                 await asyncio.gather(*tasks, return_exceptions=True)
-                raise
+                logging.getLogger("deepeval").error(
+                    "Gather timed out after %.1fs. Some metrics may be marked as timed out.",
+                    _gather_timeout(),
+                )
 
     else:
         for test_case in test_cases:
@@ -717,7 +797,7 @@ async def a_execute_test_cases(
                 asyncio.gather(*tasks),
                 timeout=_gather_timeout(),
             )
-        except asyncio.TimeoutError:
+        except (asyncio.TimeoutError, TimeoutError):
             # Cancel any still-pending tasks and drain them
             for t in tasks:
                 if not t.done():
@@ -744,6 +824,7 @@ async def _a_execute_llm_test_cases(
     progress: Optional[Progress] = None,
     pbar_id: Optional[int] = None,
 ):
+    logger.info("in _a_execute_llm_test_cases")
     pbar_test_case_id = add_pbar(
         progress,
         f" 🎯 Evaluating test case #{count}",
@@ -767,64 +848,85 @@ async def _a_execute_llm_test_cases(
767
848
  api_test_case = create_api_test_case(
768
849
  test_case=test_case, index=count if not _is_assert_test else None
769
850
  )
770
- new_cached_test_case: CachedTestCase = CachedTestCase()
771
- test_start_time = time.perf_counter()
772
- await measure_metrics_with_indicator(
773
- metrics=metrics,
774
- test_case=test_case,
775
- cached_test_case=cached_test_case,
776
- skip_on_missing_params=skip_on_missing_params,
777
- ignore_errors=ignore_errors,
778
- show_indicator=show_metrics_indicator,
779
- pbar_eval_id=pbar_test_case_id,
780
- progress=progress,
781
- )
851
+ try:
852
+ new_cached_test_case: CachedTestCase = CachedTestCase()
853
+ test_start_time = time.perf_counter()
782
854
 
783
- for metric in metrics:
784
- if metric.skipped:
785
- continue
855
+ await measure_metrics_with_indicator(
856
+ metrics=metrics,
857
+ test_case=test_case,
858
+ cached_test_case=cached_test_case,
859
+ skip_on_missing_params=skip_on_missing_params,
860
+ ignore_errors=ignore_errors,
861
+ show_indicator=show_metrics_indicator,
862
+ pbar_eval_id=pbar_test_case_id,
863
+ progress=progress,
864
+ )
865
+ except asyncio.CancelledError:
866
+ msg = (
867
+ "Timed out/cancelled while evaluating metric. "
868
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
869
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
870
+ )
871
+ for m in metrics:
872
+ if getattr(m, "skipped", False):
873
+ continue
874
+ # If the task never finished and didn't set a terminal state, mark it now
875
+ if getattr(m, "success", None) is None and not getattr(
876
+ m, "error", None
877
+ ):
878
+ m.success = False
879
+ m.error = msg
880
+ if not ignore_errors:
881
+ raise
882
+ finally:
883
+ for metric in metrics:
884
+ if metric.skipped:
885
+ continue
786
886
 
787
- metric_data = create_metric_data(metric)
788
- api_test_case.update_metric_data(metric_data)
887
+ metric_data = create_metric_data(metric)
888
+ api_test_case.update_metric_data(metric_data)
789
889
 
790
- if metric.error is None:
791
- cache_metric_data = deepcopy(metric_data)
792
- cache_metric_data.evaluation_cost = (
793
- 0 # Create new copy and save 0 for cost
794
- )
795
- updated_cached_metric_data = CachedMetricData(
796
- metric_data=cache_metric_data,
797
- metric_configuration=Cache.create_metric_configuration(metric),
798
- )
799
- new_cached_test_case.cached_metrics_data.append(
800
- updated_cached_metric_data
801
- )
890
+ if metric.error is None:
891
+ cache_metric_data = deepcopy(metric_data)
892
+ cache_metric_data.evaluation_cost = (
893
+ 0 # Create new copy and save 0 for cost
894
+ )
895
+ updated_cached_metric_data = CachedMetricData(
896
+ metric_data=cache_metric_data,
897
+ metric_configuration=Cache.create_metric_configuration(
898
+ metric
899
+ ),
900
+ )
901
+ new_cached_test_case.cached_metrics_data.append(
902
+ updated_cached_metric_data
903
+ )
802
904
 
803
- test_end_time = time.perf_counter()
804
- run_duration = test_end_time - test_start_time
805
- # Quick hack to check if all metrics were from cache
806
- if run_duration < 1:
807
- run_duration = 0
808
- api_test_case.update_run_duration(run_duration)
809
-
810
- ### Update Test Run ###
811
- test_run_manager.update_test_run(api_test_case, test_case)
812
-
813
- ### Cache Test Run ###
814
- global_test_run_cache_manager.cache_test_case(
815
- test_case,
816
- new_cached_test_case,
817
- test_run.hyperparameters,
818
- )
819
- global_test_run_cache_manager.cache_test_case(
820
- test_case,
821
- new_cached_test_case,
822
- test_run.hyperparameters,
823
- to_temp=True,
824
- )
905
+ test_end_time = time.perf_counter()
906
+ run_duration = test_end_time - test_start_time
907
+ # Quick hack to check if all metrics were from cache
908
+ if run_duration < 1:
909
+ run_duration = 0
910
+ api_test_case.update_run_duration(run_duration)
911
+
912
+ ### Update Test Run ###
913
+ test_run_manager.update_test_run(api_test_case, test_case)
825
914
 
826
- test_results.append(create_test_result(api_test_case))
827
- update_pbar(progress, pbar_id)
915
+ ### Cache Test Run ###
916
+ global_test_run_cache_manager.cache_test_case(
917
+ test_case,
918
+ new_cached_test_case,
919
+ test_run.hyperparameters,
920
+ )
921
+ global_test_run_cache_manager.cache_test_case(
922
+ test_case,
923
+ new_cached_test_case,
924
+ test_run.hyperparameters,
925
+ to_temp=True,
926
+ )
927
+
928
+ test_results.append(create_test_result(api_test_case))
929
+ update_pbar(progress, pbar_id)
828
930
 
829
931
 
830
932
  async def _a_execute_mllm_test_cases(
@@ -856,31 +958,50 @@ async def _a_execute_mllm_test_cases(
856
958
  test_case=test_case, index=count if not _is_assert_test else None
857
959
  )
858
960
  test_start_time = time.perf_counter()
859
- await measure_metrics_with_indicator(
860
- metrics=metrics,
861
- test_case=test_case,
862
- cached_test_case=None,
863
- skip_on_missing_params=skip_on_missing_params,
864
- ignore_errors=ignore_errors,
865
- show_indicator=show_metrics_indicator,
866
- pbar_eval_id=pbar_test_case_id,
867
- progress=progress,
868
- )
869
- for metric in metrics:
870
- if metric.skipped:
871
- continue
961
+ try:
962
+ await measure_metrics_with_indicator(
963
+ metrics=metrics,
964
+ test_case=test_case,
965
+ cached_test_case=None,
966
+ skip_on_missing_params=skip_on_missing_params,
967
+ ignore_errors=ignore_errors,
968
+ show_indicator=show_metrics_indicator,
969
+ pbar_eval_id=pbar_test_case_id,
970
+ progress=progress,
971
+ )
972
+ except asyncio.CancelledError:
973
+ msg = (
974
+ "Timed out/cancelled while evaluating metric. "
975
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
976
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
977
+ )
978
+ for m in metrics:
979
+ if getattr(m, "skipped", False):
980
+ continue
981
+ # If the task never finished and didn't set a terminal state, mark it now
982
+ if getattr(m, "success", None) is None and not getattr(
983
+ m, "error", None
984
+ ):
985
+ m.success = False
986
+ m.error = msg
987
+ if not ignore_errors:
988
+ raise
989
+ finally:
990
+ for metric in metrics:
991
+ if metric.skipped:
992
+ continue
872
993
 
873
- metric_data = create_metric_data(metric)
874
- api_test_case.update_metric_data(metric_data)
994
+ metric_data = create_metric_data(metric)
995
+ api_test_case.update_metric_data(metric_data)
875
996
 
876
- test_end_time = time.perf_counter()
877
- run_duration = test_end_time - test_start_time
878
- api_test_case.update_run_duration(run_duration)
997
+ test_end_time = time.perf_counter()
998
+ run_duration = test_end_time - test_start_time
999
+ api_test_case.update_run_duration(run_duration)
879
1000
 
880
- ### Update Test Run ###
881
- test_run_manager.update_test_run(api_test_case, test_case)
882
- test_results.append(create_test_result(api_test_case))
883
- update_pbar(progress, pbar_id)
1001
+ ### Update Test Run ###
1002
+ test_run_manager.update_test_run(api_test_case, test_case)
1003
+ test_results.append(create_test_result(api_test_case))
1004
+ update_pbar(progress, pbar_id)
884
1005
 
885
1006
 
886
1007
  async def _a_execute_conversational_test_cases(
@@ -915,33 +1036,55 @@ async def _a_execute_conversational_test_cases(
915
1036
  )
916
1037
 
917
1038
  test_start_time = time.perf_counter()
918
- await measure_metrics_with_indicator(
919
- metrics=metrics,
920
- test_case=test_case,
921
- cached_test_case=None,
922
- skip_on_missing_params=skip_on_missing_params,
923
- ignore_errors=ignore_errors,
924
- show_indicator=show_metrics_indicator,
925
- pbar_eval_id=pbar_test_case_id,
926
- progress=progress,
927
- )
928
- for metric in metrics:
929
- if metric.skipped:
930
- continue
931
1039
 
932
- metric_data = create_metric_data(metric)
933
- api_test_case.update_metric_data(metric_data)
1040
+ try:
1041
+ await measure_metrics_with_indicator(
1042
+ metrics=metrics,
1043
+ test_case=test_case,
1044
+ cached_test_case=None,
1045
+ skip_on_missing_params=skip_on_missing_params,
1046
+ ignore_errors=ignore_errors,
1047
+ show_indicator=show_metrics_indicator,
1048
+ pbar_eval_id=pbar_test_case_id,
1049
+ progress=progress,
1050
+ )
934
1051
 
935
- test_end_time = time.perf_counter()
936
- if len(metrics) > 0:
937
- run_duration = test_end_time - test_start_time
938
- api_test_case.update_run_duration(run_duration)
1052
+ except asyncio.CancelledError:
1053
+ msg = (
1054
+ "Timed out/cancelled while evaluating metric. "
1055
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
1056
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
1057
+ )
1058
+ for m in metrics:
1059
+ if getattr(m, "skipped", False):
1060
+ continue
1061
+ # If the task never finished and didn't set a terminal state, mark it now
1062
+ if getattr(m, "success", None) is None and not getattr(
1063
+ m, "error", None
1064
+ ):
1065
+ m.success = False
1066
+ m.error = msg
1067
+ if not ignore_errors:
1068
+ raise
939
1069
 
940
- ### Update Test Run ###
941
- test_run_manager.update_test_run(api_test_case, test_case)
1070
+ finally:
1071
+ for metric in metrics:
1072
+ if metric.skipped:
1073
+ continue
1074
+
1075
+ metric_data = create_metric_data(metric)
1076
+ api_test_case.update_metric_data(metric_data)
1077
+
1078
+ test_end_time = time.perf_counter()
1079
+ if len(metrics) > 0:
1080
+ run_duration = test_end_time - test_start_time
1081
+ api_test_case.update_run_duration(run_duration)
942
1082
 
943
- test_results.append(create_test_result(api_test_case))
944
- update_pbar(progress, pbar_id)
1083
+ ### Update Test Run ###
1084
+ test_run_manager.update_test_run(api_test_case, test_case)
1085
+
1086
+ test_results.append(create_test_result(api_test_case))
1087
+ update_pbar(progress, pbar_id)
945
1088
 
946
1089
 
947
1090
  ###########################################
@@ -965,7 +1108,11 @@ def execute_agentic_test_cases(
     test_run_manager = global_test_run_manager
 
     test_run_manager.save_to_disk = cache_config.write_cache
-    test_run_manager.get_test_run(identifier=identifier)
+    test_run = test_run_manager.get_test_run(identifier=identifier)
+    if test_run is None:
+        # Create if not found
+        test_run_manager.create_test_run(identifier=identifier)
+        test_run = test_run_manager.get_test_run(identifier=identifier)
 
     local_trace_manager = trace_manager
     local_trace_manager.evaluating = True
@@ -975,152 +1122,137 @@ def execute_agentic_test_cases(
975
1122
  progress: Optional[Progress] = None,
976
1123
  pbar_id: Optional[int] = None,
977
1124
  ):
978
- count = 0
1125
+ count = -1
979
1126
  show_metric_indicator = (
980
1127
  display_config.show_indicator and not _use_bar_indicator
981
1128
  )
982
1129
 
983
1130
  for golden in goldens:
984
- with capture_evaluation_run("golden"):
985
- count += 1
986
- total_tags = count_observe_decorators_in_module(
987
- observed_callback
988
- )
989
- pbar_tags_id = add_pbar(
990
- progress,
991
- f" ⚡ Invoking observed callback (#{count})",
992
- total=total_tags,
993
- )
994
-
995
- with Observer(
996
- "custom",
997
- func_name="Test Wrapper",
998
- _progress=progress,
999
- _pbar_callback_id=pbar_tags_id,
1000
- ):
1001
-
1002
- if asyncio.iscoroutinefunction(observed_callback):
1003
- loop = get_or_create_event_loop()
1004
- coro = observed_callback(golden.input)
1005
- loop.run_until_complete(
1006
- asyncio.wait_for(
1007
- coro,
1008
- timeout=_per_task_timeout(),
1009
- )
1010
- )
1011
- else:
1012
- observed_callback(golden.input)
1013
- current_trace: Trace = current_trace_context.get()
1014
-
1015
- update_pbar(progress, pbar_tags_id, advance=total_tags)
1016
- update_pbar(progress, pbar_id)
1017
-
1018
- # Create empty trace api for llm api test case
1019
- trace_api = create_api_trace(current_trace, golden)
1020
-
1021
- # Format golden as test case to create llm api test case
1022
- test_case = LLMTestCase(
1023
- input=golden.input,
1024
- actual_output=(
1025
- str(current_trace.output)
1026
- if current_trace.output is not None
1027
- else None
1028
- ),
1029
- expected_output=current_trace.expected_output,
1030
- context=current_trace.context,
1031
- retrieval_context=current_trace.retrieval_context,
1032
- additional_metadata=golden.additional_metadata,
1033
- tools_called=current_trace.tools_called,
1034
- expected_tools=current_trace.expected_tools,
1035
- comments=golden.comments,
1036
- name=golden.name,
1037
- _dataset_alias=golden._dataset_alias,
1038
- _dataset_id=golden._dataset_id,
1039
- )
1040
- api_test_case = create_api_test_case(
1041
- test_case=test_case,
1042
- trace=trace_api,
1043
- index=count if not _is_assert_test else None,
1044
- )
1131
+ count += 1
1045
1132
 
1046
- # Run DFS to calculate metrics synchronously
1047
- def dfs(
1048
- span: BaseSpan,
1049
- progress: Optional[Progress] = None,
1050
- pbar_eval_id: Optional[int] = None,
1051
- ):
1052
- # Create API Span
1053
- metrics: List[BaseMetric] = list(span.metrics or [])
1054
- api_span: BaseApiSpan = (
1055
- trace_manager._convert_span_to_api_span(span)
1133
+ pbar_case_increments = (
1134
+ 0 # tracks how many times we advance `pbar_id` for this golden
1135
+ )
1136
+ emitted_trace = set()
1137
+ current_trace: Optional[Trace] = None
1138
+ trace_api = None
1139
+ api_test_case = None
1140
+ test_case = None
1141
+
1142
+ def _run_golden():
1143
+ nonlocal current_trace, trace_api, api_test_case, test_case, pbar_case_increments
1144
+ # keep the evaluation context inside the timed function
1145
+ with capture_evaluation_run("golden"):
1146
+ total_tags = count_observe_decorators_in_module(
1147
+ observed_callback
1148
+ )
1149
+ pbar_tags_id = add_pbar(
1150
+ progress,
1151
+ f" ⚡ Invoking observed callback (#{count})",
1152
+ total=total_tags,
1056
1153
  )
1057
1154
 
1058
- if isinstance(span, AgentSpan):
1059
- trace_api.agent_spans.append(api_span)
1060
- elif isinstance(span, LlmSpan):
1061
- trace_api.llm_spans.append(api_span)
1062
- log_prompt(span, test_run_manager)
1063
- elif isinstance(span, RetrieverSpan):
1064
- trace_api.retriever_spans.append(api_span)
1065
- elif isinstance(span, ToolSpan):
1066
- trace_api.tool_spans.append(api_span)
1067
- else:
1068
- trace_api.base_spans.append(api_span)
1069
-
1070
- # Skip errored trace/span
1071
- if _skip_metrics_for_error(span=span, trace=current_trace):
1072
- api_span.status = TraceSpanApiStatus.ERRORED
1073
- api_span.error = span.error or _trace_error(
1074
- current_trace
1075
- )
1076
- if progress and pbar_eval_id is not None:
1077
- update_pbar(
1078
- progress,
1079
- pbar_eval_id,
1080
- advance=count_metrics_in_span_subtree(span),
1155
+ with Observer(
1156
+ "custom",
1157
+ func_name="Test Wrapper",
1158
+ _progress=progress,
1159
+ _pbar_callback_id=pbar_tags_id,
1160
+ ):
1161
+ if asyncio.iscoroutinefunction(observed_callback):
1162
+ loop = get_or_create_event_loop()
1163
+ coro = observed_callback(golden.input)
1164
+ loop.run_until_complete(
1165
+ _await_with_outer_deadline(
1166
+ coro,
1167
+ timeout=_per_task_timeout(),
1168
+ )
1081
1169
  )
1082
- return
1170
+ else:
1171
+ observed_callback(golden.input)
1083
1172
 
1084
- for child in span.children:
1085
- dfs(child, progress, pbar_eval_id)
1173
+ # we have a trace now
1174
+ current_trace = current_trace_context.get()
1086
1175
 
1087
- if not span.metrics:
1088
- return
1089
- requires_trace = any(
1090
- metric.requires_trace for metric in span.metrics
1176
+ update_pbar(progress, pbar_tags_id, advance=total_tags)
1177
+ update_pbar(progress, pbar_id)
1178
+ pbar_case_increments += 1
1179
+
1180
+ # Create empty trace api for llm api test case
1181
+ trace_api = create_api_trace(current_trace, golden)
1182
+
1183
+ # Build the test case and api test case
1184
+ test_case = LLMTestCase(
1185
+ input=golden.input,
1186
+ actual_output=(
1187
+ str(current_trace.output)
1188
+ if current_trace
1189
+ and current_trace.output is not None
1190
+ else None
1191
+ ),
1192
+ expected_output=(
1193
+ current_trace.expected_output
1194
+ if current_trace
1195
+ else None
1196
+ ),
1197
+ context=(
1198
+ current_trace.context if current_trace else None
1199
+ ),
1200
+ retrieval_context=(
1201
+ current_trace.retrieval_context
1202
+ if current_trace
1203
+ else None
1204
+ ),
1205
+ additional_metadata=golden.additional_metadata,
1206
+ tools_called=(
1207
+ current_trace.tools_called
1208
+ if current_trace
1209
+ else None
1210
+ ),
1211
+ expected_tools=(
1212
+ current_trace.expected_tools
1213
+ if current_trace
1214
+ else None
1215
+ ),
1216
+ comments=golden.comments,
1217
+ name=golden.name,
1218
+ _dataset_alias=golden._dataset_alias,
1219
+ _dataset_id=golden._dataset_id,
1220
+ )
1221
+ api_test_case = create_api_test_case(
1222
+ test_case=test_case,
1223
+ trace=trace_api,
1224
+ index=count if not _is_assert_test else None,
1091
1225
  )
1092
1226
 
1093
- llm_test_case = None
1094
- if span.input is not None:
1095
- llm_test_case = LLMTestCase(
1096
- input=str(span.input),
1097
- actual_output=(
1098
- str(span.output)
1099
- if span.output is not None
1100
- else None
1101
- ),
1102
- expected_output=span.expected_output,
1103
- context=span.context,
1104
- retrieval_context=span.retrieval_context,
1105
- tools_called=span.tools_called,
1106
- expected_tools=span.expected_tools,
1227
+ # DFS and trace metric evaluation
1228
+ def dfs(
1229
+ span: BaseSpan,
1230
+ progress: Optional[Progress] = None,
1231
+ pbar_eval_id: Optional[int] = None,
1232
+ ):
1233
+ metrics: List[BaseMetric] = list(span.metrics or [])
1234
+ api_span: BaseApiSpan = (
1235
+ trace_manager._convert_span_to_api_span(span)
1107
1236
  )
1108
1237
 
1109
- # add trace if task completion
1110
- if requires_trace:
1111
- if llm_test_case is None:
1112
- llm_test_case = LLMTestCase(input="None")
1113
- llm_test_case._trace_dict = (
1114
- trace_manager.create_nested_spans_dict(span)
1115
- )
1116
- else:
1117
- if llm_test_case is None:
1238
+ if isinstance(span, AgentSpan):
1239
+ trace_api.agent_spans.append(api_span)
1240
+ elif isinstance(span, LlmSpan):
1241
+ trace_api.llm_spans.append(api_span)
1242
+ log_prompt(span, test_run_manager)
1243
+ elif isinstance(span, RetrieverSpan):
1244
+ trace_api.retriever_spans.append(api_span)
1245
+ elif isinstance(span, ToolSpan):
1246
+ trace_api.tool_spans.append(api_span)
1247
+ else:
1248
+ trace_api.base_spans.append(api_span)
1249
+
1250
+ if _skip_metrics_for_error(
1251
+ span=span, trace=current_trace
1252
+ ):
1118
1253
  api_span.status = TraceSpanApiStatus.ERRORED
1119
- api_span.error = format_error_text(
1120
- DeepEvalError(
1121
- "Span has metrics but no LLMTestCase. "
1122
- "Are you sure you called `update_current_span()`?"
1123
- )
1254
+ api_span.error = span.error or _trace_error(
1255
+ current_trace
1124
1256
  )
1125
1257
  if progress and pbar_eval_id is not None:
1126
1258
  update_pbar(
@@ -1130,155 +1262,386 @@ def execute_agentic_test_cases(
1130
1262
  )
1131
1263
  return
1132
1264
 
1133
- # Preparing metric calculation
1134
- api_span.metrics_data = []
1135
- for metric in metrics:
1136
- metric.skipped = False
1137
- metric.error = None
1138
- if display_config.verbose_mode is not None:
1139
- metric.verbose_mode = display_config.verbose_mode
1140
-
1141
- # Metric calculation
1142
- for metric in metrics:
1143
- metric_data = None
1144
- res = _execute_metric(
1145
- metric=metric,
1146
- test_case=llm_test_case,
1147
- show_metric_indicator=show_metric_indicator,
1148
- in_component=True,
1149
- error_config=error_config,
1150
- )
1151
- if res == "skip":
1152
- continue
1153
- metric_data = create_metric_data(metric)
1154
- api_span.metrics_data.append(metric_data)
1155
- api_test_case.update_status(metric_data.success)
1156
- update_pbar(progress, pbar_eval_id)
1157
-
1158
- trace_level_metrics_count = (
1159
- len(current_trace.metrics) if current_trace.metrics else 0
1160
- )
1161
- pbar_eval_id = add_pbar(
1162
- progress,
1163
- f" 🎯 Evaluating component(s) (#{count})",
1164
- total=count_metrics_in_trace(trace=current_trace)
1165
- + trace_level_metrics_count,
1166
- )
1167
-
1168
- start_time = time.perf_counter()
1265
+ # evaluate children first
1266
+ for child in span.children:
1267
+ dfs(child, progress, pbar_eval_id)
1169
1268
 
1170
- skip_metrics_for_this_golden = False
1171
- # Handle trace-level metrics
1172
- if _skip_metrics_for_error(trace=current_trace):
1173
- trace_api.status = TraceSpanApiStatus.ERRORED
1174
- if progress and pbar_eval_id is not None:
1175
- update_pbar(
1176
- progress,
1177
- pbar_eval_id,
1178
- advance=count_total_metrics_for_trace(
1179
- current_trace
1180
- ),
1269
+ # If there are no metrics, then there is nothing to do on this span.
1270
+ if not metrics:
1271
+ return
1272
+
1273
+ has_task_completion = any(
1274
+ isinstance(metric, TaskCompletionMetric)
1275
+ for metric in metrics
1181
1276
  )
1182
- else:
1183
- if current_trace.metrics:
1277
+
1184
1278
  requires_trace = any(
1185
- metric.requires_trace
1186
- for metric in current_trace.metrics
1279
+ getattr(metric, "requires_trace", False)
1280
+ for metric in metrics
1187
1281
  )
1188
1282
 
1189
1283
  llm_test_case = None
1190
- if current_trace.input:
1284
+ if span.input is not None:
1191
1285
  llm_test_case = LLMTestCase(
1192
- input=str(current_trace.input),
1286
+ input=str(span.input),
1193
1287
  actual_output=(
1194
- str(current_trace.output)
1195
- if current_trace.output is not None
1288
+ str(span.output)
1289
+ if span.output is not None
1196
1290
  else None
1197
1291
  ),
1198
- expected_output=current_trace.expected_output,
1199
- context=current_trace.context,
1200
- retrieval_context=current_trace.retrieval_context,
1201
- tools_called=current_trace.tools_called,
1202
- expected_tools=current_trace.expected_tools,
1292
+ expected_output=span.expected_output,
1293
+ context=span.context,
1294
+ retrieval_context=span.retrieval_context,
1295
+ tools_called=span.tools_called,
1296
+ expected_tools=span.expected_tools,
1203
1297
  )
1204
- if requires_trace:
1298
+
1299
+ # If any metric needs a trace tree or a completion verdict, attach the trace
1300
+ if has_task_completion or requires_trace:
1205
1301
  if llm_test_case is None:
1206
1302
  llm_test_case = LLMTestCase(input="None")
1207
1303
  llm_test_case._trace_dict = (
1208
- trace_manager.create_nested_spans_dict(
1209
- current_trace.root_spans[0]
1210
- )
1304
+ trace_manager.create_nested_spans_dict(span)
1211
1305
  )
1212
1306
  else:
1307
+ # Without a test case we cannot evaluate span metrics
1213
1308
  if llm_test_case is None:
1214
- current_trace.status = TraceSpanStatus.ERRORED
1215
- trace_api.status = TraceSpanApiStatus.ERRORED
1216
- if current_trace.root_spans:
1217
- current_trace.root_spans[0].status = (
1218
- TraceSpanStatus.ERRORED
1219
- )
1220
- current_trace.root_spans[0].error = (
1221
- format_error_text(
1222
- DeepEvalError(
1223
- "Trace has metrics but no LLMTestCase (missing input/output). "
1224
- "Are you sure you called `update_current_trace()`?"
1225
- )
1226
- )
1309
+ api_span.status = TraceSpanApiStatus.ERRORED
1310
+ api_span.error = format_error_text(
1311
+ DeepEvalError(
1312
+ "Span has metrics but no LLMTestCase. "
1313
+ "Are you sure you called `update_current_span()`?"
1227
1314
  )
1315
+ )
1228
1316
  if progress and pbar_eval_id is not None:
1229
1317
  update_pbar(
1230
1318
  progress,
1231
1319
  pbar_eval_id,
1232
- advance=count_total_metrics_for_trace(
1233
- current_trace
1320
+ advance=count_metrics_in_span_subtree(
1321
+ span
1234
1322
  ),
1235
1323
  )
1236
- skip_metrics_for_this_golden = True
1324
+ return
1325
+
1326
+ # Preparing metric calculation
1327
+ api_span.metrics_data = []
1328
+ for metric in metrics:
1329
+ metric.skipped = False
1330
+ metric.error = None
1331
+ if display_config.verbose_mode is not None:
1332
+ metric.verbose_mode = (
1333
+ display_config.verbose_mode
1334
+ )
1237
1335
 
1238
- if not skip_metrics_for_this_golden:
1239
- for metric in current_trace.metrics:
1240
- metric.skipped = False
1241
- metric.error = None
1242
- if display_config.verbose_mode is not None:
1243
- metric.verbose_mode = (
1244
- display_config.verbose_mode
1336
+ # Metric calculation
1337
+ for metric in metrics:
1338
+ res = _execute_metric(
1339
+ metric=metric,
1340
+ test_case=llm_test_case,
1341
+ show_metric_indicator=show_metric_indicator,
1342
+ in_component=True,
1343
+ error_config=error_config,
1344
+ )
1345
+ if res == "skip":
1346
+ continue
1347
+ metric_data = create_metric_data(metric)
1348
+ api_span.metrics_data.append(metric_data)
1349
+ api_test_case.update_status(metric_data.success)
1350
+ update_pbar(progress, pbar_eval_id)
1351
+
1352
+ trace_level_metrics_count = (
1353
+ len(current_trace.metrics)
1354
+ if current_trace and current_trace.metrics
1355
+ else 0
1356
+ )
1357
+ pbar_eval_id = add_pbar(
1358
+ progress,
1359
+ f" 🎯 Evaluating component(s) (#{count})",
1360
+ total=count_metrics_in_trace(trace=current_trace)
1361
+ + trace_level_metrics_count,
1362
+ )
1363
+
1364
+ start_time = time.perf_counter()
1365
+
1366
+ skip_metrics_for_this_golden = False
1367
+ if _skip_metrics_for_error(trace=current_trace):
1368
+ trace_api.status = TraceSpanApiStatus.ERRORED
1369
+ if progress and pbar_eval_id is not None:
1370
+ update_pbar(
1371
+ progress,
1372
+ pbar_eval_id,
1373
+ advance=count_total_metrics_for_trace(
1374
+ current_trace
1375
+ ),
1376
+ )
1377
+ else:
1378
+ if current_trace and current_trace.metrics:
1379
+ has_task_completion = any(
1380
+ isinstance(metric, TaskCompletionMetric)
1381
+ for metric in current_trace.metrics
1382
+ )
1383
+ requires_trace = any(
1384
+ getattr(metric, "requires_trace", False)
1385
+ for metric in current_trace.metrics
1386
+ )
1387
+ llm_test_case = None
1388
+ if current_trace.input:
1389
+ llm_test_case = LLMTestCase(
1390
+ input=str(current_trace.input),
1391
+ actual_output=(
1392
+ str(current_trace.output)
1393
+ if current_trace.output is not None
1394
+ else None
1395
+ ),
1396
+ expected_output=current_trace.expected_output,
1397
+ context=current_trace.context,
1398
+ retrieval_context=current_trace.retrieval_context,
1399
+ tools_called=current_trace.tools_called,
1400
+ expected_tools=current_trace.expected_tools,
1401
+ )
1402
+ if has_task_completion or requires_trace:
1403
+ if llm_test_case is None:
1404
+ llm_test_case = LLMTestCase(input="None")
1405
+ llm_test_case._trace_dict = (
1406
+ trace_manager.create_nested_spans_dict(
1407
+ current_trace.root_spans[0]
1408
+ )
1409
+ )
1410
+ else:
1411
+ if llm_test_case is None:
1412
+ current_trace.status = (
1413
+ TraceSpanStatus.ERRORED
1414
+ )
1415
+ trace_api.status = (
1416
+ TraceSpanApiStatus.ERRORED
1245
1417
  )
1418
+ if current_trace.root_spans:
1419
+ current_trace.root_spans[0].status = (
1420
+ TraceSpanStatus.ERRORED
1421
+ )
1422
+ current_trace.root_spans[0].error = (
1423
+ format_error_text(
1424
+ DeepEvalError(
1425
+ "Trace has metrics but no LLMTestCase (missing input/output). "
1426
+ "Are you sure you called `update_current_trace()`?"
1427
+ )
1428
+ )
1429
+ )
1430
+ if progress and pbar_eval_id is not None:
1431
+ update_pbar(
1432
+ progress,
1433
+ pbar_eval_id,
1434
+ advance=count_total_metrics_for_trace(
1435
+ current_trace
1436
+ ),
1437
+ )
1438
+ skip_metrics_for_this_golden = True
1439
+
1440
+ if not skip_metrics_for_this_golden:
1441
+ for metric in current_trace.metrics:
1442
+ metric.skipped = False
1443
+ metric.error = None
1444
+ if display_config.verbose_mode is not None:
1445
+ metric.verbose_mode = (
1446
+ display_config.verbose_mode
1447
+ )
1246
1448
 
1247
- trace_api.metrics_data = []
1248
- for metric in current_trace.metrics:
1249
- res = _execute_metric(
1250
- metric=metric,
1251
- test_case=llm_test_case,
1252
- show_metric_indicator=show_metric_indicator,
1253
- in_component=True,
1254
- error_config=error_config,
1449
+ trace_api.metrics_data = []
1450
+ for metric in current_trace.metrics:
1451
+ res = _execute_metric(
1452
+ metric=metric,
1453
+ test_case=llm_test_case,
1454
+ show_metric_indicator=show_metric_indicator,
1455
+ in_component=True,
1456
+ error_config=error_config,
1457
+ )
1458
+ if res == "skip":
1459
+ continue
1460
+
1461
+ if not metric.skipped:
1462
+ metric_data = create_metric_data(metric)
1463
+ trace_api.metrics_data.append(
1464
+ metric_data
1465
+ )
1466
+ api_test_case.update_metric_data(
1467
+ metric_data
1468
+ )
1469
+ api_test_case.update_status(
1470
+ metric_data.success
1471
+ )
1472
+ emitted_trace.add(id(metric))
1473
+ update_pbar(progress, pbar_eval_id)
1474
+
1475
+ # handle span metrics
1476
+ dfs(
1477
+ current_trace.root_spans[0],
1478
+ progress,
1479
+ pbar_eval_id,
1480
+ )
1481
+
1482
+ # TODO: Do I need this block, or is it duplicated in finally?
1483
+ end_time = time.perf_counter()
1484
+ run_duration = end_time - start_time
1485
+ api_test_case.update_run_duration(run_duration)
1486
+ test_run_manager.update_test_run(api_test_case, test_case)
1487
+ test_results.append(create_test_result(api_test_case))
1488
+ test_results.extend(extract_trace_test_results(trace_api))
1489
+ update_pbar(progress, pbar_id)
1490
+ pbar_case_increments += 1
1491
+
1492
+ # run the golden with a timeout
1493
+ start_time = time.perf_counter()
1494
+ deadline = _per_task_timeout()
1495
+
1496
+ try:
1497
+ run_sync_with_timeout(_run_golden, deadline)
1498
+ except (asyncio.TimeoutError, TimeoutError):
1499
+ # mark any not yet finished trace level and span level metrics as timed out.
1500
+ msg = (
1501
+ f"Timed out after {deadline:.2f}s while executing agentic test case. "
1502
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
1503
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
1504
+ )
1505
+
1506
+ if current_trace is not None:
1507
+ # Trace-level metrics
1508
+ if getattr(current_trace, "metrics", None):
1509
+ for m in current_trace.metrics:
1510
+ if getattr(m, "skipped", False):
1511
+ continue
1512
+ # if already has a terminal state, leave it alone
1513
+ if getattr(
1514
+ m, "success", None
1515
+ ) is not None or getattr(m, "error", None):
1516
+ continue
1517
+ m.success = False
1518
+ m.error = msg
1519
+
1520
+ # span level metrics, walk the tree
1521
+ def _walk(span):
1522
+ for child in getattr(span, "children", []) or []:
1523
+ _walk(child)
1524
+ for m in list(getattr(span, "metrics", []) or []):
1525
+ if getattr(m, "skipped", False):
1526
+ continue
1527
+ if getattr(
1528
+ m, "success", None
1529
+ ) is not None or getattr(m, "error", None):
1530
+ continue
1531
+ m.success = False
1532
+ m.error = msg
1533
+
1534
+ for root in getattr(current_trace, "root_spans", []) or []:
1535
+ _walk(root)
1536
+
1537
+ # raise if we are not ignoring errors
1538
+ if not error_config.ignore_errors:
1539
+ raise
1540
+
1541
+ finally:
1542
+ try:
1543
+ # Ensure we have an api_test_case to attach results to.
1544
+ if api_test_case is None:
1545
+ # build a minimal test_case
1546
+ if test_case is None:
1547
+ out = (
1548
+ str(current_trace.output)
1549
+ if (
1550
+ current_trace is not None
1551
+ and current_trace.output is not None
1255
1552
  )
1256
- if res == "skip":
1553
+ else None
1554
+ )
1555
+ test_case = LLMTestCase(
1556
+ input=golden.input,
1557
+ actual_output=out,
1558
+ expected_output=(
1559
+ current_trace.expected_output
1560
+ if current_trace
1561
+ else None
1562
+ ),
1563
+ context=(
1564
+ current_trace.context
1565
+ if current_trace
1566
+ else None
1567
+ ),
1568
+ retrieval_context=(
1569
+ current_trace.retrieval_context
1570
+ if current_trace
1571
+ else None
1572
+ ),
1573
+ additional_metadata=golden.additional_metadata,
1574
+ tools_called=(
1575
+ current_trace.tools_called
1576
+ if current_trace
1577
+ else None
1578
+ ),
1579
+ expected_tools=(
1580
+ current_trace.expected_tools
1581
+ if current_trace
1582
+ else None
1583
+ ),
1584
+ comments=golden.comments,
1585
+ name=golden.name,
1586
+ _dataset_alias=golden._dataset_alias,
1587
+ _dataset_id=golden._dataset_id,
1588
+ )
1589
+
1590
+ # Create a trace API if we have a trace
1591
+ if trace_api is None and current_trace is not None:
1592
+ trace_api = create_api_trace(current_trace, golden)
1593
+
1594
+ api_test_case = create_api_test_case(
1595
+ test_case=test_case,
1596
+ trace=trace_api,
1597
+ index=count if not _is_assert_test else None,
1598
+ )
1599
+
1600
+ if test_run is not None:
1601
+ test_run_manager.set_test_run(test_run)
1602
+
1603
+ if api_test_case.success is None:
1604
+ api_test_case.update_status(False)
1605
+
1606
+ # try to update metric data
1607
+ if current_trace is not None:
1608
+ if current_trace.metrics:
1609
+ for m in current_trace.metrics:
1610
+ if getattr(m, "skipped", False):
1611
+ continue
1612
+ if id(m) in emitted_trace:
1257
1613
  continue
1614
+ api_test_case.update_metric_data(
1615
+ create_metric_data(m)
1616
+ )
1258
1617
 
1259
- if not metric.skipped:
1260
- metric_data = create_metric_data(metric)
1261
- trace_api.metrics_data.append(metric_data)
1262
- api_test_case.update_metric_data(
1263
- metric_data
1264
- )
1265
- api_test_case.update_status(
1266
- metric_data.success
1267
- )
1268
- update_pbar(progress, pbar_eval_id)
1618
+ # Finalize duration and persist
1619
+ elapsed = time.perf_counter() - start_time
1620
+ api_test_case.update_run_duration(
1621
+ elapsed if elapsed >= 0 else deadline
1622
+ )
1269
1623
 
1270
- # Then handle span-level metrics
1271
- dfs(current_trace.root_spans[0], progress, pbar_eval_id)
1624
+ if (
1625
+ api_test_case.metrics_data == []
1626
+ and api_test_case.trace is None
1627
+ ):
1628
+ api_test_case.metrics_data = None
1272
1629
 
1273
- end_time = time.perf_counter()
1274
- run_duration = end_time - start_time
1275
- # Update test run
1276
- api_test_case.update_run_duration(run_duration)
1277
- test_run_manager.update_test_run(api_test_case, test_case)
1278
- test_results.append(create_test_result(api_test_case))
1279
- test_results.extend(extract_trace_test_results(trace_api))
1630
+ test_run_manager.update_test_run(api_test_case, test_case)
1631
+ test_results.append(create_test_result(api_test_case))
1280
1632
 
1281
- update_pbar(progress, pbar_id)
1633
+ if trace_api is not None:
1634
+ test_results.extend(
1635
+ extract_trace_test_results(trace_api)
1636
+ )
1637
+
1638
+ missing = 2 - pbar_case_increments
1639
+ if missing > 0:
1640
+ update_pbar(progress, pbar_id, advance=missing)
1641
+
1642
+ finally:
1643
+ # nothing to clean here, but keep symmetry with other paths
1644
+ pass
1282
1645
 
1283
1646
  if display_config.show_indicator and _use_bar_indicator:
1284
1647
  progress = Progress(
@@ -1319,9 +1682,9 @@ async def a_execute_agentic_test_cases(
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-            return await asyncio.wait_for(
-                func(*args, **kwargs),
-                timeout=_per_task_timeout(),
+            timeout = _per_task_timeout()
+            return await _await_with_outer_deadline(
+                func, *args, timeout=timeout, **kwargs
             )
 
     test_run_manager = global_test_run_manager
@@ -1374,7 +1737,7 @@ async def a_execute_agentic_test_cases(
                     asyncio.gather(*tasks),
                     timeout=_gather_timeout(),
                 )
-            except asyncio.TimeoutError:
+            except (asyncio.TimeoutError, TimeoutError):
                 # Cancel any still-pending tasks and drain them
                 for t in tasks:
                     if not t.done():
@@ -1426,94 +1789,89 @@ async def _a_execute_agentic_test_case(
 progress: Optional[Progress] = None,
 pbar_id: Optional[int] = None,
 ):
- if observed_callback:
- total_tags = count_observe_decorators_in_module(observed_callback)
- pbar_tags_id = add_pbar(
- progress,
- f" ⚡ Invoking observed callback (#{count})",
- total=total_tags,
- )
+ test_start_time = time.perf_counter()
+ current_trace = None
+ trace_api = None
+ test_case = None
+ api_test_case = None
+ try:
+ if observed_callback:
+ total_tags = count_observe_decorators_in_module(observed_callback)
+ pbar_tags_id = add_pbar(
+ progress,
+ f" ⚡ Invoking observed callback (#{count})",
+ total=total_tags,
+ )

- # Call callback and extract trace
- with Observer(
- "custom",
- func_name="Test Wrapper",
- _progress=progress,
- _pbar_callback_id=pbar_tags_id,
- ):
- if asyncio.iscoroutinefunction(observed_callback):
- await asyncio.wait_for(
- observed_callback(golden.input),
- timeout=_per_task_timeout(),
- )
- else:
- observed_callback(golden.input)
- current_trace: Trace = current_trace_context.get()
+ # Call callback and extract trace
+ with Observer(
+ "custom",
+ func_name="Test Wrapper",
+ _progress=progress,
+ _pbar_callback_id=pbar_tags_id,
+ ):
+ # get current_trace right away, we need it even if cancelled
+ current_trace: Trace = current_trace_context.get()
+ if asyncio.iscoroutinefunction(observed_callback):
+ await _await_with_outer_deadline(
+ observed_callback,
+ golden.input,
+ timeout=_per_task_timeout(),
+ )
+ else:
+ observed_callback(golden.input)

- update_pbar(progress, pbar_tags_id, advance=total_tags)
- update_pbar(progress, pbar_id)
+ update_pbar(progress, pbar_tags_id, advance=total_tags)
+ update_pbar(progress, pbar_id)

- elif trace:
- current_trace = trace
+ elif trace:
+ current_trace = trace

- if trace_metrics:
- current_trace.metrics = trace_metrics
+ trace_level_metrics_count = 0

- # run evals through DFS
- trace_api = create_api_trace(trace=current_trace, golden=golden)
+ if trace_metrics:
+ current_trace.metrics = trace_metrics

- trace_level_metrics_count = (
- len(current_trace.metrics) if current_trace.metrics else 0
- )
+ # run evals through DFS
+ trace_api = create_api_trace(trace=current_trace, golden=golden)

- pbar_eval_id = add_pbar(
- progress,
- f" 🎯 Evaluating component(s) (#{count})",
- total=count_metrics_in_trace(trace=current_trace)
- + trace_level_metrics_count,
- )
+ trace_level_metrics_count = (
+ len(current_trace.metrics) if current_trace.metrics else 0
+ )

- test_case = LLMTestCase(
- input=golden.input,
- actual_output=(
- str(current_trace.output)
- if current_trace.output is not None
- else None
- ),
- expected_output=current_trace.expected_output,
- context=current_trace.context,
- retrieval_context=current_trace.retrieval_context,
- tools_called=current_trace.tools_called,
- expected_tools=current_trace.expected_tools,
- additional_metadata=golden.additional_metadata,
- comments=golden.comments,
- name=golden.name,
- _dataset_alias=golden._dataset_alias,
- _dataset_id=golden._dataset_id,
- )
- api_test_case = create_api_test_case(
- test_case=test_case,
- trace=trace_api,
- index=count if not _is_assert_test else None,
- )
+ pbar_eval_id = add_pbar(
+ progress,
+ f" 🎯 Evaluating component(s) (#{count})",
+ total=count_metrics_in_trace(trace=current_trace)
+ + trace_level_metrics_count,
+ )

- await _a_execute_trace_test_case(
- trace=current_trace,
- trace_api=trace_api,
- api_test_case=api_test_case,
- ignore_errors=ignore_errors,
- skip_on_missing_params=skip_on_missing_params,
- show_indicator=show_indicator,
- verbose_mode=verbose_mode,
- progress=progress,
- pbar_eval_id=pbar_eval_id,
- _use_bar_indicator=_use_bar_indicator,
- )
+ test_case = LLMTestCase(
+ input=golden.input,
+ actual_output=(
+ str(current_trace.output)
+ if current_trace.output is not None
+ else None
+ ),
+ expected_output=current_trace.expected_output,
+ context=current_trace.context,
+ retrieval_context=current_trace.retrieval_context,
+ tools_called=current_trace.tools_called,
+ expected_tools=current_trace.expected_tools,
+ additional_metadata=golden.additional_metadata,
+ comments=golden.comments,
+ name=golden.name,
+ _dataset_alias=golden._dataset_alias,
+ _dataset_id=golden._dataset_id,
+ )
+ api_test_case = create_api_test_case(
+ test_case=test_case,
+ trace=trace_api,
+ index=count if not _is_assert_test else None,
+ )

- async def dfs(trace: Trace, span: BaseSpan):
- await _a_execute_span_test_case(
- span=span,
- current_trace=trace,
+ await _a_execute_trace_test_case(
+ trace=current_trace,
 trace_api=trace_api,
 api_test_case=api_test_case,
 ignore_errors=ignore_errors,
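One detail worth noting in the hunk above: the trace is now read from `current_trace_context` before the callback is awaited, so the reference survives even if the await is cancelled. A minimal, self-contained illustration of that ordering with `contextvars` (the variable and trace shape here are illustrative, not deepeval's):

```python
# Minimal illustration of reading the context variable *before* the await:
# if the await is cancelled or times out, the reference is already captured
# and can still be used for reporting. Names here are illustrative.
import asyncio
import contextvars
from typing import Optional

current_trace_var: contextvars.ContextVar[Optional[dict]] = contextvars.ContextVar(
    "current_trace", default=None
)


async def run_case() -> None:
    current_trace_var.set({"uuid": "trace-123"})
    trace = current_trace_var.get()  # captured up front
    try:
        await asyncio.wait_for(asyncio.sleep(10), timeout=0.01)
    except (asyncio.TimeoutError, TimeoutError):
        # Even though the callback never finished, we still know which trace
        # this case belongs to.
        print("cancelled, but trace is known:", trace["uuid"])


asyncio.run(run_case())
```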
@@ -1522,56 +1880,150 @@ async def _a_execute_agentic_test_case(
 verbose_mode=verbose_mode,
 progress=progress,
 pbar_eval_id=pbar_eval_id,
- test_run_manager=test_run_manager,
 _use_bar_indicator=_use_bar_indicator,
 )

- if _skip_metrics_for_error(span=span, trace=trace):
- return
+ async def dfs(trace: Trace, span: BaseSpan):
+ await _a_execute_span_test_case(
+ span=span,
+ current_trace=trace,
+ trace_api=trace_api,
+ api_test_case=api_test_case,
+ ignore_errors=ignore_errors,
+ skip_on_missing_params=skip_on_missing_params,
+ show_indicator=show_indicator,
+ verbose_mode=verbose_mode,
+ progress=progress,
+ pbar_eval_id=pbar_eval_id,
+ test_run_manager=test_run_manager,
+ _use_bar_indicator=_use_bar_indicator,
+ )

- child_tasks = [
- asyncio.create_task(dfs(trace, child)) for child in span.children
- ]
- if child_tasks:
- try:
- await asyncio.wait_for(
- asyncio.gather(*child_tasks),
- timeout=_gather_timeout(),
- )
- except asyncio.TimeoutError:
- for t in child_tasks:
- if not t.done():
- t.cancel()
- await asyncio.gather(*child_tasks, return_exceptions=True)
- raise
+ if _skip_metrics_for_error(span=span, trace=trace):
+ return

- test_start_time = time.perf_counter()
+ child_tasks = [
+ asyncio.create_task(dfs(trace, child))
+ for child in span.children
+ ]
+ if child_tasks:
+ try:
+ await asyncio.wait_for(
+ asyncio.gather(*child_tasks),
+ timeout=_gather_timeout(),
+ )
+ except (asyncio.TimeoutError, TimeoutError):
+ for t in child_tasks:
+ if not t.done():
+ t.cancel()
+ await asyncio.gather(*child_tasks, return_exceptions=True)
+ raise

- if not _skip_metrics_for_error(trace=current_trace):
- if current_trace and current_trace.root_spans:
- await dfs(current_trace, current_trace.root_spans[0])
- else:
- if (
- logger.isEnabledFor(logging.DEBUG)
- and get_settings().DEEPEVAL_VERBOSE_MODE
- ):
- logger.debug(
- "Skipping DFS: empty trace or no root spans (trace=%s)",
- current_trace.uuid if current_trace else None,
+ if not _skip_metrics_for_error(trace=current_trace):
+ if current_trace and current_trace.root_spans:
+ await dfs(current_trace, current_trace.root_spans[0])
+ else:
+ if (
+ logger.isEnabledFor(logging.DEBUG)
+ and get_settings().DEEPEVAL_VERBOSE_MODE
+ ):
+ logger.debug(
+ "Skipping DFS: empty trace or no root spans (trace=%s)",
+ current_trace.uuid if current_trace else None,
+ )
+ except asyncio.CancelledError:
+ # mark any unfinished metrics as cancelled
+ cancel_msg = (
+ "Timed out/cancelled while evaluating agentic test case. "
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+ )
+
+ if trace_metrics:
+ for m in trace_metrics:
+ if getattr(m, "skipped", False):
+ continue
+ if getattr(m, "success", None) is None and not getattr(
+ m, "error", None
+ ):
+ m.success = False
+ m.error = cancel_msg
+
+ if trace is not None and trace.metrics:
+ for m in trace.metrics:
+ if getattr(m, "skipped", False):
+ continue
+ if getattr(m, "success", None) is None and not getattr(
+ m, "error", None
+ ):
+ m.success = False
+ m.error = cancel_msg
+ if not ignore_errors:
+ raise
+ finally:
+ try:
+ if api_test_case is None:
+ if test_case is None:
+ test_case = LLMTestCase(
+ input=golden.input,
+ actual_output=None,
+ expected_output=None,
+ context=None,
+ retrieval_context=None,
+ additional_metadata=golden.additional_metadata,
+ tools_called=None,
+ expected_tools=None,
+ comments=golden.comments,
+ name=golden.name,
+ _dataset_alias=golden._dataset_alias,
+ _dataset_id=golden._dataset_id,
+ )
+ if trace is not None and trace_api is None:
+ trace_api = create_api_trace(trace, golden)
+
+ api_test_case = create_api_test_case(
+ test_case=test_case,
+ trace=trace_api,
+ index=(count if not _is_assert_test else None),
 )

- test_end_time = time.perf_counter()
- run_duration = test_end_time - test_start_time
+ # attach MetricData for any trace metrics we marked above
+ if trace_metrics:
+ for m in trace_metrics:
+ if getattr(m, "skipped", False):
+ continue
+ api_test_case.update_metric_data(create_metric_data(m))
+
+ # If nothing set success yet, mark the case failed
+ if api_test_case.success is None:
+ api_test_case.update_status(False)

- api_test_case.update_run_duration(run_duration)
- test_run_manager.update_test_run(api_test_case, test_case)
- main_result = create_test_result(api_test_case)
- trace_results = extract_trace_test_results(trace_api)
- unique_trace_results = filter_duplicate_results(main_result, trace_results)
- test_results.append(main_result)
- test_results.extend(unique_trace_results)
+ # test_run_manager.update_test_run returns early if api_test_case.metrics_data is an empty list.
+ # Set it to None to ensure the test_case is added
+ if api_test_case.metrics_data == [] and api_test_case.trace is None:
+ api_test_case.metrics_data = None

- update_pbar(progress, pbar_id)
+ # Duration & persist
+ test_end_time = time.perf_counter()
+ run_duration = test_end_time - test_start_time
+ api_test_case.update_run_duration(run_duration)
+ test_run_manager.update_test_run(api_test_case, test_case)
+
+ # Build results and de-duplicate against trace results
+ main_result = create_test_result(api_test_case)
+ trace_results = (
+ extract_trace_test_results(trace_api)
+ if trace_api is not None
+ else []
+ )
+ unique_trace_results = filter_duplicate_results(
+ main_result, trace_results
+ )
+ test_results.append(main_result)
+ test_results.extend(unique_trace_results)
+ update_pbar(progress, pbar_id)
+ finally:
+ pass


 async def _a_execute_span_test_case(
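The restructured function above follows a cancel-then-persist shape: on cancellation or timeout, any metric that never produced a verdict is marked failed with an explanatory message, and a `finally` block still builds a fallback test case and records whatever partial results exist. A stripped-down sketch of that shape with stand-in names (`Metric`, `persist`, `results` are not deepeval's API):

```python
# Stripped-down sketch of the cancel-then-persist shape.
# Metric, persist, and results are stand-ins, not deepeval's API.
import asyncio
import time
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Metric:
    name: str
    success: Optional[bool] = None
    error: Optional[str] = None
    skipped: bool = False


results: List[dict] = []


def persist(metrics: List[Metric], duration: float) -> None:
    results.append({"metrics": [m.__dict__ for m in metrics], "duration": duration})


async def run_case(metrics: List[Metric]) -> None:
    start = time.perf_counter()
    try:
        await asyncio.wait_for(asyncio.sleep(10), timeout=0.01)  # the "work"
    except (asyncio.TimeoutError, TimeoutError, asyncio.CancelledError):
        # Give every unfinished, non-skipped metric an explicit failure reason.
        for m in metrics:
            if not m.skipped and m.success is None and not m.error:
                m.success = False
                m.error = "Timed out/cancelled before the metric finished."
    finally:
        # Persist whatever we have, even on the error path.
        persist(metrics, time.perf_counter() - start)


asyncio.run(run_case([Metric("task_completion")]))
print(results)
```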
@@ -2177,9 +2629,8 @@ def a_execute_agentic_test_cases_from_loop(

 async def execute_callback_with_semaphore(coroutine: Awaitable):
 async with semaphore:
- return await asyncio.wait_for(
- coroutine, timeout=_per_task_timeout()
- )
+ timeout = _per_task_timeout()
+ return await _await_with_outer_deadline(coroutine, timeout=timeout)

 def evaluate_test_cases(
 progress: Optional[Progress] = None,
@@ -2328,16 +2779,25 @@ def a_execute_agentic_test_cases_from_loop(
 meta,
 )
 elif exc is not None:
+
+ show_trace = bool(
+ get_settings().DEEPEVAL_LOG_STACK_TRACES
+ )
+ exc_info = (
+ (
+ type(exc),
+ exc,
+ getattr(exc, "__traceback__", None),
+ )
+ if show_trace
+ else None
+ )
 logger.error(
 "[deepeval] task ERROR %s after %.2fs meta=%r",
 t.get_name(),
 duration,
 meta,
- exc_info=(
- type(exc),
- exc,
- getattr(exc, "__traceback__", None),
- ),
+ exc_info=exc_info,
 )
 else:
 logger.info(
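Gating `exc_info` on a settings flag keeps error logs to one line by default while still allowing full tracebacks on demand. A generic version of the same idea, reading the flag from an environment variable rather than deepeval's settings object:

```python
# Generic version of conditional-traceback logging; the flag is read from an
# environment variable here instead of deepeval's settings object.
import logging
import os

logger = logging.getLogger("example")
logging.basicConfig(level=logging.INFO)


def log_task_error(task_name: str, duration: float, exc: BaseException) -> None:
    show_trace = os.getenv("LOG_STACK_TRACES", "0") == "1"
    # logging treats a falsy exc_info as "no traceback", so None is safe here.
    exc_info = (type(exc), exc, exc.__traceback__) if show_trace else None
    logger.error("task ERROR %s after %.2fs", task_name, duration, exc_info=exc_info)


try:
    raise ValueError("boom")
except ValueError as err:
    log_task_error("case-7", 1.23, err)  # one line unless LOG_STACK_TRACES=1
```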
@@ -2396,7 +2856,7 @@ def a_execute_agentic_test_cases_from_loop(
 )
 )

- except asyncio.TimeoutError:
+ except (asyncio.TimeoutError, TimeoutError):
 import traceback

 pending = [t for t in created_tasks if not t.done()]
@@ -2609,9 +3069,9 @@ async def _a_evaluate_traces(

 async def execute_evals_with_semaphore(func: Callable, *args, **kwargs):
 async with semaphore:
- return await asyncio.wait_for(
- func(*args, **kwargs),
- timeout=_per_task_timeout(),
+ timeout = _per_task_timeout()
+ return await _await_with_outer_deadline(
+ func, *args, timeout=timeout, **kwargs
 )

 eval_tasks = []
@@ -2661,7 +3121,7 @@ async def _a_evaluate_traces(
 asyncio.gather(*eval_tasks),
 timeout=_gather_timeout(),
 )
- except asyncio.TimeoutError:
+ except (asyncio.TimeoutError, TimeoutError):
 for t in eval_tasks:
 if not t.done():
 t.cancel()
@@ -2689,9 +3149,9 @@ async def _evaluate_test_case_pairs(

 async def execute_with_semaphore(func: Callable, *args, **kwargs):
 async with semaphore:
- return await asyncio.wait_for(
- func(*args, **kwargs),
- timeout=_per_task_timeout(),
+ timeout = _per_task_timeout()
+ return await _await_with_outer_deadline(
+ func, *args, timeout=timeout, **kwargs
 )

 tasks = []
@@ -2731,7 +3191,7 @@ async def _evaluate_test_case_pairs(
 asyncio.gather(*tasks),
 timeout=_gather_timeout(),
 )
- except asyncio.TimeoutError:
+ except (asyncio.TimeoutError, TimeoutError):
 # Cancel any still-pending tasks and drain them
 for t in tasks:
 if not t.done():
@@ -2756,6 +3216,9 @@ def _execute_metric(
 )
 except MissingTestCaseParamsError as e:
 if error_config.skip_on_missing_params:
+ metric.skipped = True
+ metric.error = None
+ metric.success = None
 return "skip"
 else:
 if error_config.ignore_errors:
@@ -2768,6 +3231,9 @@ def _execute_metric(
 metric.measure(test_case)
 except MissingTestCaseParamsError as e:
 if error_config.skip_on_missing_params:
+ metric.skipped = True
+ metric.error = None
+ metric.success = None
 return "skip"
 else:
 if error_config.ignore_errors:
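Both branches above now record *why* a metric produced no result: `skipped=True` with `error` and `success` cleared, so downstream aggregation can tell "skipped for missing test-case params" apart from "failed". A compact sketch of consuming that flag (the metric class here is a stand-in, not a deepeval metric):

```python
# Compact sketch of consuming the `skipped` flag; FakeMetric is a stand-in,
# not a deepeval metric class.
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class FakeMetric:
    name: str
    success: Optional[bool] = None
    error: Optional[str] = None
    skipped: bool = False


def summarize(metrics: List[FakeMetric]) -> dict:
    return {
        "skipped": [m.name for m in metrics if m.skipped],
        "failed": [m.name for m in metrics if not m.skipped and m.success is False],
        "passed": [m.name for m in metrics if m.success is True],
    }


metrics = [
    FakeMetric("exact_match", success=True),
    FakeMetric("pattern_match", skipped=True),  # missing test-case params
    FakeMetric("task_completion", success=False, error="timeout"),
]
print(summarize(metrics))
```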