deepeval 3.6.7__py3-none-any.whl → 3.6.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +104 -36
- deepeval/config/utils.py +5 -0
- deepeval/dataset/dataset.py +162 -30
- deepeval/dataset/utils.py +41 -13
- deepeval/errors.py +20 -2
- deepeval/evaluate/execute.py +1662 -688
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +13 -3
- deepeval/integrations/crewai/__init__.py +2 -1
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +8 -8
- deepeval/metrics/indicator.py +21 -1
- deepeval/metrics/mcp/mcp_task_completion.py +7 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/task_completion/task_completion.py +1 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/amazon_bedrock_model.py +20 -17
- deepeval/models/llms/openai_model.py +10 -1
- deepeval/models/retry_policy.py +103 -20
- deepeval/openai/extractors.py +61 -16
- deepeval/openai/patch.py +8 -12
- deepeval/openai/types.py +1 -1
- deepeval/openai/utils.py +108 -1
- deepeval/prompt/prompt.py +1 -0
- deepeval/prompt/utils.py +43 -14
- deepeval/simulator/conversation_simulator.py +25 -18
- deepeval/synthesizer/chunking/context_generator.py +9 -1
- deepeval/synthesizer/synthesizer.py +11 -10
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/test_run.py +190 -207
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +23 -4
- deepeval/tracing/trace_context.py +53 -38
- deepeval/tracing/tracing.py +23 -0
- deepeval/tracing/types.py +16 -14
- deepeval/utils.py +21 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/METADATA +1 -1
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/RECORD +85 -63
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- deepeval/tracing/message_types/__init__.py +0 -10
- deepeval/tracing/message_types/base.py +0 -6
- deepeval/tracing/message_types/messages.py +0 -14
- deepeval/tracing/message_types/tools.py +0 -18
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/WHEEL +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/execute.py
CHANGED
@@ -1,3 +1,4 @@
+import inspect
 import logging
 
 from rich.progress import (
@@ -43,9 +44,14 @@ from deepeval.tracing.api import (
 )
 from deepeval.dataset import Golden
 from deepeval.contextvars import set_current_golden, reset_current_golden
-from deepeval.errors import MissingTestCaseParamsError
+from deepeval.errors import MissingTestCaseParamsError, DeepEvalError
 from deepeval.metrics.utils import copy_metrics
-from deepeval.utils import
+from deepeval.utils import (
+    get_or_create_event_loop,
+    shorten,
+    len_medium,
+    format_error_text,
+)
 from deepeval.telemetry import capture_evaluation_run
 from deepeval.metrics import (
     BaseMetric,
@@ -56,6 +62,11 @@ from deepeval.metrics import (
 from deepeval.metrics.indicator import (
     measure_metrics_with_indicator,
 )
+from deepeval.models.retry_policy import (
+    set_outer_deadline,
+    reset_outer_deadline,
+    run_sync_with_timeout,
+)
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
@@ -82,10 +93,13 @@ from deepeval.evaluate.utils import (
     create_metric_data,
     create_test_result,
     count_metrics_in_trace,
+    count_total_metrics_for_trace,
+    count_metrics_in_span_subtree,
     extract_trace_test_results,
 )
 from deepeval.utils import add_pbar, update_pbar, custom_console
-from deepeval.tracing.types import TestCaseMetricPair
+from deepeval.tracing.types import TestCaseMetricPair, TraceSpanStatus
+from deepeval.tracing.api import TraceSpanApiStatus
 from deepeval.config.settings import get_settings
 from deepeval.test_run import TEMP_FILE_PATH
 from deepeval.confident.api import is_confident
@@ -97,6 +111,108 @@ from deepeval.test_run.hyperparameters import (
 logger = logging.getLogger(__name__)
 
 
+def _skip_metrics_for_error(
+    span: Optional[BaseSpan] = None,
+    trace: Optional[Trace] = None,
+) -> bool:
+    # trace failure: skip everything under this trace
+    if trace is not None and trace.status == TraceSpanStatus.ERRORED:
+        return True
+    # span failure: skip this span’s metrics
+    if span is not None and span.status == TraceSpanStatus.ERRORED:
+        return True
+    return False
+
+
+def _trace_error(current_trace: Trace) -> Optional[str]:
+    def _first_err(s: BaseSpan) -> Optional[str]:
+        if s.status == TraceSpanStatus.ERRORED and s.error:
+            return s.error
+        for c in s.children or []:
+            e = _first_err(c)
+            if e:
+                return e
+        return None
+
+    for root in current_trace.root_spans or []:
+        e = _first_err(root)
+        if e:
+            return e
+    return None
+
+
+def _get_trace_by_uuid_anywhere(trace_uuid: str):
+    """
+    Resolver for a trace UUID across the manager's state.
+
+    First tries the manager's indexed lookup, which (covers active/in-flight traces,
+    then does a linear scan of the full `trace_manager.traces` list, which covers
+    traces that were recorded/closed earlier or not yet indexed. Returns
+    the concrete Trace object or None if not found.
+    """
+    tr = trace_manager.get_trace_by_uuid(trace_uuid)
+    if tr:
+        return tr
+    for tr in trace_manager.traces:
+        if tr.uuid == trace_uuid:
+            return tr
+    return None
+
+
+def _pick_root_for_marking(trace):
+    """
+    Choose the most appropriate root span to annotate on error/cancel.
+
+    Heuristic:
+    - Prefer the most recent open root, which will have no `end_time` since this is the
+      span currently in flight.
+    - If none are open, use the last root span if it exists.
+    - If the trace has no roots, return None.
+
+    This favors marking the active root in multi root traces while remaining
+    stable for already closed traces.
+    """
+    open_roots = [rs for rs in trace.root_spans if rs.end_time is None]
+    return (
+        open_roots[-1]
+        if open_roots
+        else (trace.root_spans[-1] if trace.root_spans else None)
+    )
+
+
+def _resolve_trace_and_root_for_task(t: asyncio.Task):
+    """
+    Resolve trace and root for a completed task using the weak binding map.
+
+    Steps:
+    1. Look up the task in `trace_manager.task_bindings` to get the
+       bound `trace_uuid` and, if available, `root_span_uuid`.
+    2. Resolve the Trace with `_get_trace_by_uuid_anywhere`.
+    3. If a bound root UUID exists, try to find that exact root on the trace.
+    4. Otherwise, fall back to `_pick_root_for_marking(trace)`.
+
+    Returns a trace / root tuple. Either may be `None` when no binding is
+    present. This function is used by `on_task_done` to robustly mark error/cancel
+    states without assuming a single root trace or a root that is still open.
+    """
+    binding = trace_manager.task_bindings.get(t) or {}
+    trace_uuid = binding.get("trace_uuid")
+    root_span_uuid = binding.get("root_span_uuid")
+
+    trace = _get_trace_by_uuid_anywhere(trace_uuid) if trace_uuid else None
+    root = None
+
+    if trace and root_span_uuid:
+        root = next(
+            (rs for rs in trace.root_spans if rs.uuid == root_span_uuid), None
+        )
+
+    if trace and root is None:
+        root = _pick_root_for_marking(trace)
+
+    return trace, root
+
+
 async def _snapshot_tasks():
     cur = asyncio.current_task()
     # `all_tasks` returns tasks for the current running loop only
@@ -115,6 +231,32 @@ def _gather_timeout() -> float:
     )
 
 
+def filter_duplicate_results(
+    main_result: TestResult, results: List[TestResult]
+) -> List[TestResult]:
+    return [
+        result
+        for result in results
+        if not (
+            (result.input == main_result.input)
+            and (result.actual_output == main_result.actual_output)
+            and (result.metrics_data == main_result.metrics_data)
+        )
+    ]
+
+
+async def _await_with_outer_deadline(obj, *args, timeout: float, **kwargs):
+    token = set_outer_deadline(timeout)
+    try:
+        if inspect.isawaitable(obj):
+            coro = obj
+        else:
+            coro = obj(*args, **kwargs)
+        return await asyncio.wait_for(coro, timeout=timeout)
+    finally:
+        reset_outer_deadline(token)
+
+
 ###########################################
 ### E2E Evals #############################
 ###########################################
@@ -146,6 +288,13 @@ def execute_test_cases(
 
     test_run_manager.save_to_disk = cache_config.write_cache
     test_run = test_run_manager.get_test_run(identifier=identifier)
+    if test_run is None:
+        # ensure we have a test_run ( in case it couldn't be loaded from disk )
+        test_run_manager.create_test_run(identifier=identifier)
+        test_run = test_run_manager.get_test_run(identifier=identifier)
+
+    # capture once for inner closures
+    hyperparameters = test_run.hyperparameters if test_run is not None else None
 
     if display_config.verbose_mode is not None:
         for metric in metrics:
@@ -166,176 +315,228 @@ def execute_test_cases(
     test_results: List[TestResult] = []
 
     def evaluate_test_cases(
-        progress: Optional[Progress] = None, pbar_id: Optional[
+        progress: Optional[Progress] = None, pbar_id: Optional[int] = None
     ):
         llm_test_case_count = -1
+        mllm_test_case_count = -1
         conversational_test_case_count = -1
         show_metric_indicator = (
             display_config.show_indicator and not _use_bar_indicator
         )
         for i, test_case in enumerate(test_cases):
+            # skip what we know we won't run
+            if isinstance(test_case, LLMTestCase):
+                if not llm_metrics:
+                    update_pbar(progress, pbar_id)
+                    continue
+                per_case_total = len(llm_metrics)
+            elif isinstance(test_case, MLLMTestCase):
+                if not mllm_metrics:
+                    update_pbar(progress, pbar_id)
+                    continue
+                per_case_total = len(mllm_metrics)
+            elif isinstance(test_case, ConversationalTestCase):
+                if not conversational_metrics:
+                    update_pbar(progress, pbar_id)
+                    continue
+                per_case_total = len(conversational_metrics)
+
             pbar_test_case_id = add_pbar(
                 progress,
                 f"    🎯 Evaluating test case #{i}",
-                total=
+                total=per_case_total,
             )
-            with capture_evaluation_run("test case"):
-                for metric in metrics:
-                    metric.error = None  # Reset metric error
-
-                if isinstance(test_case, LLMTestCase):
-                    if len(llm_metrics) == 0:
-                        continue
-
-                    llm_test_case_count += 1
-                    cached_test_case = None
-                    if cache_config.use_cache:
-                        cached_test_case = (
-                            global_test_run_cache_manager.get_cached_test_case(
-                                test_case, test_run.hyperparameters
-                            )
-                        )
-
-                    ##### Metric Calculation #####
-                    api_test_case: LLMApiTestCase = create_api_test_case(
-                        test_case=test_case, index=llm_test_case_count
-                    )
-                    new_cached_test_case: CachedTestCase = CachedTestCase()
-
-                    test_start_time = time.perf_counter()
-                    read_all_metrics_from_cache = True
-                    for metric in llm_metrics:
-                        metric_data = None
-                        if cached_test_case is not None:
-                            cached_metric_data = Cache.get_metric_data(
-                                metric, cached_test_case
-                            )
-                            if cached_metric_data:
-                                metric_data = cached_metric_data.metric_data
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                                updated_cached_metric_data = CachedMetricData(
-                                    metric_data=cache_metric_data,
-                                    metric_configuration=Cache.create_metric_configuration(
-                                        metric
-                                    ),
-                                )
-                            new_cached_test_case.cached_metrics_data.append(
-                                updated_cached_metric_data
-                            )
-                        update_pbar(progress, pbar_test_case_id)
-
-                    test_end_time = time.perf_counter()
-                    if read_all_metrics_from_cache:
-                        run_duration = 0
-                    else:
-                        run_duration = test_end_time - test_start_time
-                    api_test_case.update_run_duration(run_duration)
-
-                    ### Update Test Run ###
-                    test_run_manager.update_test_run(api_test_case, test_case)
-
-                    ### Cache Test Run ###
-                    global_test_run_cache_manager.cache_test_case(
-                        test_case,
-                        new_cached_test_case,
-                        test_run.hyperparameters,
-                    )
-                    global_test_run_cache_manager.cache_test_case(
-                        test_case,
-                        new_cached_test_case,
-                        test_run.hyperparameters,
-                        to_temp=True,
+            metrics_for_case = (
+                llm_metrics
+                if isinstance(test_case, LLMTestCase)
+                else (
+                    mllm_metrics
+                    if isinstance(test_case, MLLMTestCase)
+                    else conversational_metrics
+                )
+            )
+            api_test_case = create_api_test_case(
+                test_case=test_case,
+                index=(
+                    llm_test_case_count + 1
+                    if isinstance(test_case, LLMTestCase)
+                    else (
+                        mllm_test_case_count + 1
+                        if isinstance(test_case, MLLMTestCase)
+                        else conversational_test_case_count + 1
                     )
+                ),
+            )
+            emitted = [False] * len(metrics_for_case)
+            index_of = {id(m): i for i, m in enumerate(metrics_for_case)}
+            current_index = -1
+            start_time = time.perf_counter()
+            deadline_timeout = _per_task_timeout()
+            deadline_token = set_outer_deadline(deadline_timeout)
+            new_cached_test_case: CachedTestCase = None
+            try:
 
-
-
-
+                def _run_case():
+                    nonlocal new_cached_test_case, current_index, llm_test_case_count, mllm_test_case_count, conversational_test_case_count
+                    with capture_evaluation_run("test case"):
+                        for metric in metrics:
+                            metric.error = None  # Reset metric error
+
+                        if isinstance(test_case, LLMTestCase):
+                            llm_test_case_count += 1
+                            cached_test_case = None
+                            if cache_config.use_cache:
+                                cached_test_case = global_test_run_cache_manager.get_cached_test_case(
+                                    test_case, hyperparameters
+                                )
+
+                            ##### Metric Calculation #####
+                            new_cached_test_case = CachedTestCase()
+
+                            for metric in llm_metrics:
+                                current_index = index_of[id(metric)]
+                                metric_data = None
+                                if cached_test_case is not None:
+                                    cached_metric_data = Cache.get_metric_data(
+                                        metric, cached_test_case
+                                    )
+                                    if cached_metric_data:
+                                        metric_data = (
+                                            cached_metric_data.metric_data
+                                        )
+
+                                if metric_data is None:
+                                    res = _execute_metric(
+                                        metric=metric,
+                                        test_case=test_case,
+                                        show_metric_indicator=show_metric_indicator,
+                                        in_component=False,
+                                        error_config=error_config,
+                                    )
+                                    if res == "skip":
+                                        continue
+                                    metric_data = create_metric_data(metric)
+
+                                # here, we will check for an additional property on the flattened test cases to see if updating is necessary
+                                api_test_case.update_metric_data(metric_data)
+                                emitted[current_index] = True
+                                if metric.error is None:
+                                    cache_metric_data = deepcopy(metric_data)
+                                    cache_metric_data.evaluation_cost = 0  # Cached metrics will have evaluation cost as 0, not None.
+                                    updated_cached_metric_data = CachedMetricData(
+                                        metric_data=cache_metric_data,
+                                        metric_configuration=Cache.create_metric_configuration(
+                                            metric
+                                        ),
+                                    )
+                                    new_cached_test_case.cached_metrics_data.append(
+                                        updated_cached_metric_data
+                                    )
+                                update_pbar(progress, pbar_test_case_id)
+
+                        # No caching and not sending test cases to Confident AI for multimodal metrics yet
+                        elif isinstance(test_case, MLLMTestCase):
+                            mllm_test_case_count += 1
+                            for metric in mllm_metrics:
+                                current_index = index_of[id(metric)]
+                                res = _execute_metric(
+                                    metric=metric,
+                                    test_case=test_case,
+                                    show_metric_indicator=show_metric_indicator,
+                                    in_component=False,
+                                    error_config=error_config,
+                                )
+                                if res == "skip":
+                                    continue
+
+                                metric_data = create_metric_data(metric)
+                                api_test_case.update_metric_data(metric_data)
+                                emitted[current_index] = True
+                                update_pbar(progress, pbar_test_case_id)
+
+                        # No caching for conversational metrics yet
+                        elif isinstance(test_case, ConversationalTestCase):
+                            conversational_test_case_count += 1
+                            for metric in conversational_metrics:
+                                current_index = index_of[id(metric)]
+                                res = _execute_metric(
+                                    metric=metric,
+                                    test_case=test_case,
+                                    show_metric_indicator=show_metric_indicator,
+                                    in_component=False,
+                                    error_config=error_config,
+                                )
+                                if res == "skip":
+                                    continue
+
+                                metric_data = create_metric_data(metric)
+                                api_test_case.update_metric_data(metric_data)
+                                emitted[current_index] = True
+                                update_pbar(progress, pbar_test_case_id)
+
+                run_sync_with_timeout(_run_case, deadline_timeout)
+            except (asyncio.TimeoutError, TimeoutError):
+                msg = (
+                    f"Timed out after {deadline_timeout:.2f}s while evaluating metric. "
+                    "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+                    "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+                )
+                for i, m in enumerate(metrics_for_case):
+                    if getattr(m, "skipped", False):
                         continue
-
-
-
-                    )
-                    test_start_time = time.perf_counter()
-                    for metric in mllm_metrics:
-                        res = _execute_metric(
-                            metric=metric,
-                            test_case=test_case,
-                            show_metric_indicator=show_metric_indicator,
-                            in_component=False,
-                            error_config=error_config,
-                        )
-                        if res == "skip":
-                            continue
-
-                        metric_data = create_metric_data(metric)
-                        api_test_case.update_metric_data(metric_data)
-                        update_pbar(progress, pbar_test_case_id)
-
-                    test_end_time = time.perf_counter()
-                    if len(mllm_metrics) > 0:
-                        run_duration = test_end_time - test_start_time
-                        api_test_case.update_run_duration(run_duration)
-
-                    ### Update Test Run ###
-                    test_run_manager.update_test_run(api_test_case, test_case)
-
-                # No caching for conversational metrics yet
-                elif isinstance(test_case, ConversationalTestCase):
-                    if len(metrics) == 0:
+                    # already finished or errored? leave it
+                    if getattr(m, "success", None) is not None or getattr(
+                        m, "error", None
+                    ):
                         continue
+                    if i == current_index:
+                        m.success = False
+                        m.error = msg
+                    elif i > current_index:
+                        m.success = False
+                        m.error = "Skipped due to case timeout."
+
+                if not error_config.ignore_errors:
+                    raise
 
-
-
-
-
-
+            finally:
+                try:
+                    if (
+                        isinstance(test_case, LLMTestCase)
+                        and new_cached_test_case is not None
+                    ):
+                        ### Cache Test Run ###
+                        global_test_run_cache_manager.cache_test_case(
+                            test_case,
+                            new_cached_test_case,
+                            hyperparameters,
                        )
-
-
-
-
-
-                            metric=metric,
-                            test_case=test_case,
-                            show_metric_indicator=show_metric_indicator,
-                            in_component=False,
-                            error_config=error_config,
+                        global_test_run_cache_manager.cache_test_case(
+                            test_case,
+                            new_cached_test_case,
+                            hyperparameters,
+                            to_temp=True,
                        )
-                        if res == "skip":
-                            continue
-
-                        metric_data = create_metric_data(metric)
-                        api_test_case.update_metric_data(metric_data)
-                        update_pbar(progress, pbar_test_case_id)
 
-
-
-
+                    # Attach MetricData for *all* metrics (finished or synthesized)
+                    for i, m in enumerate(metrics_for_case):
+                        if getattr(m, "skipped", False):
+                            continue
+                        if not emitted[i]:
+                            api_test_case.update_metric_data(
+                                create_metric_data(m)
+                            )
 
-
+                    elapsed = time.perf_counter() - start_time
+                    api_test_case.update_run_duration(
+                        elapsed if elapsed >= 0 else deadline_timeout
+                    )
                    test_run_manager.update_test_run(api_test_case, test_case)
-
-
-
-
+                    test_results.append(create_test_result(api_test_case))
+                    update_pbar(progress, pbar_id)
+                finally:
+                    reset_outer_deadline(deadline_token)
 
     if display_config.show_indicator and _use_bar_indicator:
         progress = Progress(
@@ -380,7 +581,10 @@ async def a_execute_test_cases(
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-
+            timeout = _per_task_timeout()
+            return await _await_with_outer_deadline(
+                func, *args, timeout=timeout, **kwargs
+            )
 
     global_test_run_cache_manager.disable_write_cache = (
         cache_config.write_cache is False
@@ -483,7 +687,7 @@ async def a_execute_test_cases(
 
                         task = execute_with_semaphore(
                             func=_a_execute_conversational_test_cases,
-                            metrics=copy_metrics(
+                            metrics=copy_metrics(conversational_metrics),
                            test_case=test_case,
                            test_run_manager=test_run_manager,
                            test_results=test_results,
@@ -499,7 +703,22 @@ async def a_execute_test_cases(
                        tasks.append(asyncio.create_task(task))
 
                await asyncio.sleep(async_config.throttle_value)
-
+
+            try:
+                await asyncio.wait_for(
+                    asyncio.gather(*tasks),
+                    timeout=_gather_timeout(),
+                )
+            except (asyncio.TimeoutError, TimeoutError):
+                for t in tasks:
+                    if not t.done():
+                        t.cancel()
+                await asyncio.gather(*tasks, return_exceptions=True)
+                logging.getLogger("deepeval").error(
+                    "Gather timed out after %.1fs. Some metrics may be marked as timed out.",
+                    _gather_timeout(),
+                )
+
    else:
        for test_case in test_cases:
            with capture_evaluation_run("test case"):
@@ -572,7 +791,19 @@ async def a_execute_test_cases(
                    tasks.append(asyncio.create_task(task))
 
            await asyncio.sleep(async_config.throttle_value)
-
+
+        try:
+            await asyncio.wait_for(
+                asyncio.gather(*tasks),
+                timeout=_gather_timeout(),
+            )
+        except (asyncio.TimeoutError, TimeoutError):
+            # Cancel any still-pending tasks and drain them
+            for t in tasks:
+                if not t.done():
+                    t.cancel()
+            await asyncio.gather(*tasks, return_exceptions=True)
+            raise
 
    return test_results
 
@@ -593,6 +824,7 @@ async def _a_execute_llm_test_cases(
    progress: Optional[Progress] = None,
    pbar_id: Optional[int] = None,
 ):
+    logger.info("in _a_execute_llm_test_cases")
    pbar_test_case_id = add_pbar(
        progress,
        f"    🎯 Evaluating test case #{count}",
@@ -616,64 +848,85 @@ async def _a_execute_llm_test_cases(
    api_test_case = create_api_test_case(
        test_case=test_case, index=count if not _is_assert_test else None
    )
-
-
-
-        metrics=metrics,
-        test_case=test_case,
-        cached_test_case=cached_test_case,
-        skip_on_missing_params=skip_on_missing_params,
-        ignore_errors=ignore_errors,
-        show_indicator=show_metrics_indicator,
-        pbar_eval_id=pbar_test_case_id,
-        progress=progress,
-    )
+    try:
+        new_cached_test_case: CachedTestCase = CachedTestCase()
+        test_start_time = time.perf_counter()
 
-
-
-
+        await measure_metrics_with_indicator(
+            metrics=metrics,
+            test_case=test_case,
+            cached_test_case=cached_test_case,
+            skip_on_missing_params=skip_on_missing_params,
+            ignore_errors=ignore_errors,
+            show_indicator=show_metrics_indicator,
+            pbar_eval_id=pbar_test_case_id,
+            progress=progress,
+        )
+    except asyncio.CancelledError:
+        msg = (
+            "Timed out/cancelled while evaluating metric. "
+            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+        )
+        for m in metrics:
+            if getattr(m, "skipped", False):
+                continue
+            # If the task never finished and didn't set a terminal state, mark it now
+            if getattr(m, "success", None) is None and not getattr(
+                m, "error", None
+            ):
+                m.success = False
+                m.error = msg
+        if not ignore_errors:
+            raise
+    finally:
+        for metric in metrics:
+            if metric.skipped:
+                continue
 
-
-
+            metric_data = create_metric_data(metric)
+            api_test_case.update_metric_data(metric_data)
 
-
-
-
-
-
-
-
-
-
-
-
-
+            if metric.error is None:
+                cache_metric_data = deepcopy(metric_data)
+                cache_metric_data.evaluation_cost = (
+                    0  # Create new copy and save 0 for cost
+                )
+                updated_cached_metric_data = CachedMetricData(
+                    metric_data=cache_metric_data,
+                    metric_configuration=Cache.create_metric_configuration(
+                        metric
+                    ),
+                )
+                new_cached_test_case.cached_metrics_data.append(
+                    updated_cached_metric_data
+                )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        test_end_time = time.perf_counter()
+        run_duration = test_end_time - test_start_time
+        # Quick hack to check if all metrics were from cache
+        if run_duration < 1:
+            run_duration = 0
+        api_test_case.update_run_duration(run_duration)
+
+        ### Update Test Run ###
+        test_run_manager.update_test_run(api_test_case, test_case)
+
+        ### Cache Test Run ###
+        global_test_run_cache_manager.cache_test_case(
+            test_case,
+            new_cached_test_case,
+            test_run.hyperparameters,
+        )
+        global_test_run_cache_manager.cache_test_case(
+            test_case,
+            new_cached_test_case,
+            test_run.hyperparameters,
+            to_temp=True,
+        )
 
-
-
+        test_results.append(create_test_result(api_test_case))
+        update_pbar(progress, pbar_id)
 
 
 async def _a_execute_mllm_test_cases(
@@ -705,31 +958,50 @@ async def _a_execute_mllm_test_cases(
        test_case=test_case, index=count if not _is_assert_test else None
    )
    test_start_time = time.perf_counter()
-
-
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        await measure_metrics_with_indicator(
+            metrics=metrics,
+            test_case=test_case,
+            cached_test_case=None,
+            skip_on_missing_params=skip_on_missing_params,
+            ignore_errors=ignore_errors,
+            show_indicator=show_metrics_indicator,
+            pbar_eval_id=pbar_test_case_id,
+            progress=progress,
+        )
+    except asyncio.CancelledError:
+        msg = (
+            "Timed out/cancelled while evaluating metric. "
+            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+        )
+        for m in metrics:
+            if getattr(m, "skipped", False):
+                continue
+            # If the task never finished and didn't set a terminal state, mark it now
+            if getattr(m, "success", None) is None and not getattr(
+                m, "error", None
+            ):
+                m.success = False
+                m.error = msg
+        if not ignore_errors:
+            raise
+    finally:
+        for metric in metrics:
+            if metric.skipped:
+                continue
 
-
-
+            metric_data = create_metric_data(metric)
+            api_test_case.update_metric_data(metric_data)
 
-
-
-
+        test_end_time = time.perf_counter()
+        run_duration = test_end_time - test_start_time
+        api_test_case.update_run_duration(run_duration)
 
-
-
-
-
+        ### Update Test Run ###
+        test_run_manager.update_test_run(api_test_case, test_case)
+        test_results.append(create_test_result(api_test_case))
+        update_pbar(progress, pbar_id)
 
 
 async def _a_execute_conversational_test_cases(
@@ -764,33 +1036,55 @@ async def _a_execute_conversational_test_cases(
    )
 
    test_start_time = time.perf_counter()
-    await measure_metrics_with_indicator(
-        metrics=metrics,
-        test_case=test_case,
-        cached_test_case=None,
-        skip_on_missing_params=skip_on_missing_params,
-        ignore_errors=ignore_errors,
-        show_indicator=show_metrics_indicator,
-        pbar_eval_id=pbar_test_case_id,
-        progress=progress,
-    )
-    for metric in metrics:
-        if metric.skipped:
-            continue
 
-
-
+    try:
+        await measure_metrics_with_indicator(
+            metrics=metrics,
+            test_case=test_case,
+            cached_test_case=None,
+            skip_on_missing_params=skip_on_missing_params,
+            ignore_errors=ignore_errors,
+            show_indicator=show_metrics_indicator,
+            pbar_eval_id=pbar_test_case_id,
+            progress=progress,
+        )
 
-
-
-
-
+    except asyncio.CancelledError:
+        msg = (
+            "Timed out/cancelled while evaluating metric. "
+            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+        )
+        for m in metrics:
+            if getattr(m, "skipped", False):
+                continue
+            # If the task never finished and didn't set a terminal state, mark it now
+            if getattr(m, "success", None) is None and not getattr(
+                m, "error", None
+            ):
+                m.success = False
+                m.error = msg
+        if not ignore_errors:
+            raise
+
+    finally:
+        for metric in metrics:
+            if metric.skipped:
+                continue
+
+            metric_data = create_metric_data(metric)
+            api_test_case.update_metric_data(metric_data)
+
+        test_end_time = time.perf_counter()
+        if len(metrics) > 0:
+            run_duration = test_end_time - test_start_time
+            api_test_case.update_run_duration(run_duration)
 
-
-
+        ### Update Test Run ###
+        test_run_manager.update_test_run(api_test_case, test_case)
 
-
-
+        test_results.append(create_test_result(api_test_case))
+        update_pbar(progress, pbar_id)
 
 
 ###########################################
@@ -814,7 +1108,11 @@ def execute_agentic_test_cases(
    test_run_manager = global_test_run_manager
 
    test_run_manager.save_to_disk = cache_config.write_cache
-    test_run_manager.get_test_run(identifier=identifier)
+    test_run = test_run_manager.get_test_run(identifier=identifier)
+    if test_run is None:
+        # Create if not found
+        test_run_manager.create_test_run(identifier=identifier)
+        test_run = test_run_manager.get_test_run(identifier=identifier)
 
    local_trace_manager = trace_manager
    local_trace_manager.evaluating = True
@@ -824,244 +1122,526 @@ def execute_agentic_test_cases(
        progress: Optional[Progress] = None,
        pbar_id: Optional[int] = None,
    ):
-        count =
+        count = -1
        show_metric_indicator = (
            display_config.show_indicator and not _use_bar_indicator
        )
 
        for golden in goldens:
-
-            count += 1
-            total_tags = count_observe_decorators_in_module(
-                observed_callback
-            )
-            pbar_tags_id = add_pbar(
-                progress,
-                f"    ⚡ Invoking observed callback (#{count})",
-                total=total_tags,
-            )
-
-            with Observer(
-                "custom",
-                func_name="Test Wrapper",
-                _progress=progress,
-                _pbar_callback_id=pbar_tags_id,
-            ):
-                if asyncio.iscoroutinefunction(observed_callback):
-                    loop = get_or_create_event_loop()
-                    coro = observed_callback(golden.input)
-                    loop.run_until_complete(
-                        asyncio.wait_for(
-                            coro,
-                            timeout=_per_task_timeout(),
-                        )
-                    )
-                else:
-                    observed_callback(golden.input)
-            current_trace: Trace = current_trace_context.get()
+            count += 1
 
-
-
+            pbar_case_increments = (
+                0  # tracks how many times we advance `pbar_id` for this golden
+            )
+            emitted_trace = set()
+            current_trace: Optional[Trace] = None
+            trace_api = None
+            api_test_case = None
+            test_case = None
+
+            def _run_golden():
+                nonlocal current_trace, trace_api, api_test_case, test_case, pbar_case_increments
+                # keep the evaluation context inside the timed function
+                with capture_evaluation_run("golden"):
+                    total_tags = count_observe_decorators_in_module(
+                        observed_callback
+                    )
+                    pbar_tags_id = add_pbar(
+                        progress,
+                        f"    ⚡ Invoking observed callback (#{count})",
+                        total=total_tags,
+                    )
 
-
-
+                    with Observer(
+                        "custom",
+                        func_name="Test Wrapper",
+                        _progress=progress,
+                        _pbar_callback_id=pbar_tags_id,
+                    ):
+                        if asyncio.iscoroutinefunction(observed_callback):
+                            loop = get_or_create_event_loop()
+                            coro = observed_callback(golden.input)
+                            loop.run_until_complete(
+                                _await_with_outer_deadline(
+                                    coro,
+                                    timeout=_per_task_timeout(),
+                                )
+                            )
+                        else:
+                            observed_callback(golden.input)
 
-
-
-                input=golden.input,
-                actual_output=(
-                    str(current_trace.output)
-                    if current_trace.output is not None
-                    else None
-                ),
-                expected_output=current_trace.expected_output,
-                context=current_trace.context,
-                retrieval_context=current_trace.retrieval_context,
-                additional_metadata=golden.additional_metadata,
-                tools_called=current_trace.tools_called,
-                expected_tools=current_trace.expected_tools,
-                comments=golden.comments,
-                name=golden.name,
-                _dataset_alias=golden._dataset_alias,
-                _dataset_id=golden._dataset_id,
-            )
-            api_test_case = create_api_test_case(
-                test_case=test_case,
-                trace=trace_api,
-                index=count if not _is_assert_test else None,
-            )
+                    # we have a trace now
+                    current_trace = current_trace_context.get()
 
-
-
-
-
-
-
-
-
-
-
+                    update_pbar(progress, pbar_tags_id, advance=total_tags)
+                    update_pbar(progress, pbar_id)
+                    pbar_case_increments += 1
+
+                    # Create empty trace api for llm api test case
+                    trace_api = create_api_trace(current_trace, golden)
+
+                    # Build the test case and api test case
+                    test_case = LLMTestCase(
+                        input=golden.input,
+                        actual_output=(
+                            str(current_trace.output)
+                            if current_trace
+                            and current_trace.output is not None
+                            else None
+                        ),
+                        expected_output=(
+                            current_trace.expected_output
+                            if current_trace
+                            else None
+                        ),
+                        context=(
+                            current_trace.context if current_trace else None
+                        ),
+                        retrieval_context=(
+                            current_trace.retrieval_context
+                            if current_trace
+                            else None
+                        ),
+                        additional_metadata=golden.additional_metadata,
+                        tools_called=(
+                            current_trace.tools_called
+                            if current_trace
+                            else None
+                        ),
+                        expected_tools=(
+                            current_trace.expected_tools
+                            if current_trace
+                            else None
+                        ),
+                        comments=golden.comments,
+                        name=golden.name,
+                        _dataset_alias=golden._dataset_alias,
+                        _dataset_id=golden._dataset_id,
                    )
-
-
-
-
-                        log_prompt(span, test_run_manager)
-                    elif isinstance(span, RetrieverSpan):
-                        trace_api.retriever_spans.append(api_span)
-                    elif isinstance(span, ToolSpan):
-                        trace_api.tool_spans.append(api_span)
-                    else:
-                        trace_api.base_spans.append(api_span)
-
-                    for child in span.children:
-                        dfs(child, progress, pbar_eval_id)
-
-                    if span.metrics is None:
-                        return
-                    has_task_completion = any(
-                        isinstance(metric, TaskCompletionMetric)
-                        for metric in span.metrics
+                    api_test_case = create_api_test_case(
+                        test_case=test_case,
+                        trace=trace_api,
+                        index=count if not _is_assert_test else None,
                    )
 
-
-
-
-
-
-
-
-
-                        )
-                        expected_output=span.expected_output,
-                        context=span.context,
-                        retrieval_context=span.retrieval_context,
-                        tools_called=span.tools_called,
-                        expected_tools=span.expected_tools,
+                    # DFS and trace metric evaluation
+                    def dfs(
+                        span: BaseSpan,
+                        progress: Optional[Progress] = None,
+                        pbar_eval_id: Optional[int] = None,
+                    ):
+                        metrics: List[BaseMetric] = list(span.metrics or [])
+                        api_span: BaseApiSpan = (
+                            trace_manager._convert_span_to_api_span(span)
                        )
-
-
+
+                        if isinstance(span, AgentSpan):
+                            trace_api.agent_spans.append(api_span)
+                        elif isinstance(span, LlmSpan):
+                            trace_api.llm_spans.append(api_span)
+                            log_prompt(span, test_run_manager)
+                        elif isinstance(span, RetrieverSpan):
+                            trace_api.retriever_spans.append(api_span)
+                        elif isinstance(span, ToolSpan):
+                            trace_api.tool_spans.append(api_span)
+                        else:
+                            trace_api.base_spans.append(api_span)
+
+                        if _skip_metrics_for_error(
+                            span=span, trace=current_trace
+                        ):
+                            api_span.status = TraceSpanApiStatus.ERRORED
+                            api_span.error = span.error or _trace_error(
+                                current_trace
+                            )
+                            if progress and pbar_eval_id is not None:
+                                update_pbar(
+                                    progress,
+                                    pbar_eval_id,
+                                    advance=count_metrics_in_span_subtree(span),
+                                )
+                            return
+
+                        # evaluate children first
+                        for child in span.children:
+                            dfs(child, progress, pbar_eval_id)
+
+                        # If there are no metrics, then there is nothing to do on this span.
+                        if not metrics:
+                            return
+
+                        has_task_completion = any(
+                            isinstance(metric, TaskCompletionMetric)
+                            for metric in metrics
                        )
 
-
-
-                            llm_test_case = LLMTestCase(input="None")
-                        llm_test_case._trace_dict = (
-                            trace_manager.create_nested_spans_dict(span)
+                        requires_trace = any(
+                            getattr(metric, "requires_trace", False)
+                            for metric in metrics
                        )
 
-
-
-
-
-
-
-
+                        llm_test_case = None
+                        if span.input is not None:
+                            llm_test_case = LLMTestCase(
+                                input=str(span.input),
+                                actual_output=(
+                                    str(span.output)
+                                    if span.output is not None
+                                    else None
+                                ),
+                                expected_output=span.expected_output,
+                                context=span.context,
+                                retrieval_context=span.retrieval_context,
+                                tools_called=span.tools_called,
+                                expected_tools=span.expected_tools,
+                            )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                        # If any metric needs a trace tree or a completion verdict, attach the trace
+                        if has_task_completion or requires_trace:
+                            if llm_test_case is None:
+                                llm_test_case = LLMTestCase(input="None")
+                            llm_test_case._trace_dict = (
+                                trace_manager.create_nested_spans_dict(span)
+                            )
+                        else:
+                            # Without a test case we cannot evaluate span metrics
+                            if llm_test_case is None:
+                                api_span.status = TraceSpanApiStatus.ERRORED
+                                api_span.error = format_error_text(
+                                    DeepEvalError(
+                                        "Span has metrics but no LLMTestCase. "
+                                        "Are you sure you called `update_current_span()`?"
+                                    )
+                                )
+                                if progress and pbar_eval_id is not None:
+                                    update_pbar(
+                                        progress,
+                                        pbar_eval_id,
+                                        advance=count_metrics_in_span_subtree(
+                                            span
+                                        ),
+                                    )
+                                return
+
+                        # Preparing metric calculation
+                        api_span.metrics_data = []
+                        for metric in metrics:
+                            metric.skipped = False
+                            metric.error = None
+                            if display_config.verbose_mode is not None:
+                                metric.verbose_mode = (
+                                    display_config.verbose_mode
+                                )
+
+                        # Metric calculation
+                        for metric in metrics:
+                            res = _execute_metric(
+                                metric=metric,
+                                test_case=llm_test_case,
+                                show_metric_indicator=show_metric_indicator,
+                                in_component=True,
+                                error_config=error_config,
+                            )
+                            if res == "skip":
+                                continue
+                            metric_data = create_metric_data(metric)
+                            api_span.metrics_data.append(metric_data)
+                            api_test_case.update_status(metric_data.success)
+                            update_pbar(progress, pbar_eval_id)
 
-
-
-
-
-
-
-
-
+                    trace_level_metrics_count = (
+                        len(current_trace.metrics)
+                        if current_trace and current_trace.metrics
+                        else 0
+                    )
+                    pbar_eval_id = add_pbar(
+                        progress,
+                        f"    🎯 Evaluating component(s) (#{count})",
+                        total=count_metrics_in_trace(trace=current_trace)
+                        + trace_level_metrics_count,
+                    )
+
+                    start_time = time.perf_counter()
+
+                    skip_metrics_for_this_golden = False
+                    if _skip_metrics_for_error(trace=current_trace):
+                        trace_api.status = TraceSpanApiStatus.ERRORED
+                        if progress and pbar_eval_id is not None:
+                            update_pbar(
+                                progress,
+                                pbar_eval_id,
+                                advance=count_total_metrics_for_trace(
+                                    current_trace
+                                ),
+                            )
+                    else:
+                        if current_trace and current_trace.metrics:
+                            has_task_completion = any(
+                                isinstance(metric, TaskCompletionMetric)
+                                for metric in current_trace.metrics
+                            )
+                            requires_trace = any(
+                                getattr(metric, "requires_trace", False)
+                                for metric in current_trace.metrics
+                            )
+                            llm_test_case = None
+                            if current_trace.input:
+                                llm_test_case = LLMTestCase(
+                                    input=str(current_trace.input),
+                                    actual_output=(
+                                        str(current_trace.output)
+                                        if current_trace.output is not None
+                                        else None
+                                    ),
+                                    expected_output=current_trace.expected_output,
+                                    context=current_trace.context,
+                                    retrieval_context=current_trace.retrieval_context,
+                                    tools_called=current_trace.tools_called,
+                                    expected_tools=current_trace.expected_tools,
+                                )
+                            if has_task_completion or requires_trace:
+                                if llm_test_case is None:
+                                    llm_test_case = LLMTestCase(input="None")
+                                llm_test_case._trace_dict = (
+                                    trace_manager.create_nested_spans_dict(
+                                        current_trace.root_spans[0]
+                                    )
+                                )
+                            else:
+                                if llm_test_case is None:
+                                    current_trace.status = (
+                                        TraceSpanStatus.ERRORED
+                                    )
+                                    trace_api.status = (
+                                        TraceSpanApiStatus.ERRORED
+                                    )
+                                    if current_trace.root_spans:
+                                        current_trace.root_spans[0].status = (
+                                            TraceSpanStatus.ERRORED
+                                        )
+                                        current_trace.root_spans[0].error = (
+                                            format_error_text(
+                                                DeepEvalError(
+                                                    "Trace has metrics but no LLMTestCase (missing input/output). "
+                                                    "Are you sure you called `update_current_trace()`?"
+                                                )
+                                            )
+                                        )
+                                    if progress and pbar_eval_id is not None:
+                                        update_pbar(
+                                            progress,
+                                            pbar_eval_id,
+                                            advance=count_total_metrics_for_trace(
+                                                current_trace
+                                            ),
+                                        )
+                                    skip_metrics_for_this_golden = True
+
+                            if not skip_metrics_for_this_golden:
+                                for metric in current_trace.metrics:
+                                    metric.skipped = False
+                                    metric.error = None
+                                    if display_config.verbose_mode is not None:
+                                        metric.verbose_mode = (
+                                            display_config.verbose_mode
+                                        )
+
+                                trace_api.metrics_data = []
+                                for metric in current_trace.metrics:
+                                    res = _execute_metric(
+                                        metric=metric,
+                                        test_case=llm_test_case,
+                                        show_metric_indicator=show_metric_indicator,
+                                        in_component=True,
+                                        error_config=error_config,
+                                    )
+                                    if res == "skip":
+                                        continue
+
+                                    if not metric.skipped:
+                                        metric_data = create_metric_data(metric)
+                                        trace_api.metrics_data.append(
+                                            metric_data
+                                        )
+                                        api_test_case.update_metric_data(
+                                            metric_data
+                                        )
+                                        api_test_case.update_status(
+                                            metric_data.success
+                                        )
+                                        emitted_trace.add(id(metric))
+                                    update_pbar(progress, pbar_eval_id)
+
+                    # handle span metrics
+                    dfs(
+                        current_trace.root_spans[0],
+                        progress,
+                        pbar_eval_id,
+                    )
+
+                    # TODO: Do I need this block, or is it duplicated in finally?
+                    end_time = time.perf_counter()
+                    run_duration = end_time - start_time
+                    api_test_case.update_run_duration(run_duration)
+                    test_run_manager.update_test_run(api_test_case, test_case)
+                    test_results.append(create_test_result(api_test_case))
+                    test_results.extend(extract_trace_test_results(trace_api))
+                    update_pbar(progress, pbar_id)
+                    pbar_case_increments += 1
+
+            # run the golden with a timeout
+            start_time = time.perf_counter()
+            deadline = _per_task_timeout()
+
+            try:
+                run_sync_with_timeout(_run_golden, deadline)
+            except (asyncio.TimeoutError, TimeoutError):
+                # mark any not yet finished trace level and span level metrics as timed out.
+                msg = (
+                    f"Timed out after {deadline:.2f}s while executing agentic test case. "
+                    "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+                    "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
                )
 
-
+                if current_trace is not None:
+                    # Trace-level metrics
+                    if getattr(current_trace, "metrics", None):
+                        for m in current_trace.metrics:
+                            if getattr(m, "skipped", False):
+                                continue
+                            # if already has a terminal state, leave it alone
+                            if getattr(
+                                m, "success", None
+                            ) is not None or getattr(m, "error", None):
+                                continue
+                            m.success = False
+                            m.error = msg
+
+                    # span level metrics, walk the tree
+                    def _walk(span):
+                        for child in getattr(span, "children", []) or []:
+                            _walk(child)
+                        for m in list(getattr(span, "metrics", []) or []):
+                            if getattr(m, "skipped", False):
+                                continue
+                            if getattr(
+                                m, "success", None
+                            ) is not None or getattr(m, "error", None):
+                                continue
+                            m.success = False
+                            m.error = msg
 
-
-
-                has_task_completion = any(
-                    isinstance(metric, TaskCompletionMetric)
-                    for metric in current_trace.metrics
-                )
+                    for root in getattr(current_trace, "root_spans", []) or []:
+                        _walk(root)
 
-
-
-
-
-
+            # raise if we are not ignoring errors
+            if not error_config.ignore_errors:
+                raise

+            finally:
+                try:
+                    # Ensure we have an api_test_case to attach results to.
+                    if api_test_case is None:
+                        # build a minimal test_case
+                        if test_case is None:
+                            out = (
                                str(current_trace.output)
-                                if
+                                if (
+                                    current_trace is not None
+                                    and current_trace.output is not None
+                                )
                                else None
-                            ),
-                            expected_output=current_trace.expected_output,
-                            context=current_trace.context,
-                            retrieval_context=current_trace.retrieval_context,
-                            tools_called=current_trace.tools_called,
-                            expected_tools=current_trace.expected_tools,
-                        )
-                if llm_test_case is None and not has_task_completion:
-                    raise ValueError(
-                        "Unable to run metrics on trace without LLMTestCase. Are you sure you called `update_current_trace()`?"
-                    )
-
-                if has_task_completion:
-                    if llm_test_case is None:
-                        llm_test_case = LLMTestCase(input="None")
-                    llm_test_case._trace_dict = (
-                        trace_manager.create_nested_spans_dict(
-                            current_trace.root_spans[0]
                            )
-
+                            test_case = LLMTestCase(
+                                input=golden.input,
+                                actual_output=out,
+                                expected_output=(
+                                    current_trace.expected_output
+                                    if current_trace
+                                    else None
+                                ),
+                                context=(
+                                    current_trace.context
+                                    if current_trace
+                                    else None
+                                ),
+                                retrieval_context=(
+                                    current_trace.retrieval_context
+                                    if current_trace
+                                    else None
+                                ),
+                                additional_metadata=golden.additional_metadata,
+                                tools_called=(
+                                    current_trace.tools_called
|
|
1576
|
+
if current_trace
|
|
1577
|
+
else None
|
|
1578
|
+
),
|
|
1579
|
+
expected_tools=(
|
|
1580
|
+
current_trace.expected_tools
|
|
1581
|
+
if current_trace
|
|
1582
|
+
else None
|
|
1583
|
+
),
|
|
1584
|
+
comments=golden.comments,
|
|
1585
|
+
name=golden.name,
|
|
1586
|
+
_dataset_alias=golden._dataset_alias,
|
|
1587
|
+
_dataset_id=golden._dataset_id,
|
|
1588
|
+
)
|
|
1027
1589
|
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
if display_config.verbose_mode is not None:
|
|
1032
|
-
metric.verbose_mode = display_config.verbose_mode
|
|
1590
|
+
# Create a trace API if we have a trace
|
|
1591
|
+
if trace_api is None and current_trace is not None:
|
|
1592
|
+
trace_api = create_api_trace(current_trace, golden)
|
|
1033
1593
|
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
test_case=llm_test_case,
|
|
1039
|
-
show_metric_indicator=show_metric_indicator,
|
|
1040
|
-
in_component=True,
|
|
1041
|
-
error_config=error_config,
|
|
1594
|
+
api_test_case = create_api_test_case(
|
|
1595
|
+
test_case=test_case,
|
|
1596
|
+
trace=trace_api,
|
|
1597
|
+
index=count if not _is_assert_test else None,
|
|
1042
1598
|
)
|
|
1043
|
-
if res == "skip":
|
|
1044
|
-
continue
|
|
1045
1599
|
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1600
|
+
if test_run is not None:
|
|
1601
|
+
test_run_manager.set_test_run(test_run)
|
|
1602
|
+
|
|
1603
|
+
if api_test_case.success is None:
|
|
1604
|
+
api_test_case.update_status(False)
|
|
1605
|
+
|
|
1606
|
+
# try to update metric data
|
|
1607
|
+
if current_trace is not None:
|
|
1608
|
+
if current_trace.metrics:
|
|
1609
|
+
for m in current_trace.metrics:
|
|
1610
|
+
if getattr(m, "skipped", False):
|
|
1611
|
+
continue
|
|
1612
|
+
if id(m) in emitted_trace:
|
|
1613
|
+
continue
|
|
1614
|
+
api_test_case.update_metric_data(
|
|
1615
|
+
create_metric_data(m)
|
|
1616
|
+
)
|
|
1617
|
+
|
|
1618
|
+
# Finalize duration and persist
|
|
1619
|
+
elapsed = time.perf_counter() - start_time
|
|
1620
|
+
api_test_case.update_run_duration(
|
|
1621
|
+
elapsed if elapsed >= 0 else deadline
|
|
1622
|
+
)
|
|
1623
|
+
|
|
1624
|
+
if (
|
|
1625
|
+
api_test_case.metrics_data == []
|
|
1626
|
+
and api_test_case.trace is None
|
|
1627
|
+
):
|
|
1628
|
+
api_test_case.metrics_data = None
|
|
1052
1629
|
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1630
|
+
test_run_manager.update_test_run(api_test_case, test_case)
|
|
1631
|
+
test_results.append(create_test_result(api_test_case))
|
|
1632
|
+
|
|
1633
|
+
if trace_api is not None:
|
|
1634
|
+
test_results.extend(
|
|
1635
|
+
extract_trace_test_results(trace_api)
|
|
1636
|
+
)
|
|
1057
1637
|
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
test_results.append(create_test_result(api_test_case))
|
|
1062
|
-
test_results.extend(extract_trace_test_results(trace_api))
|
|
1638
|
+
missing = 2 - pbar_case_increments
|
|
1639
|
+
if missing > 0:
|
|
1640
|
+
update_pbar(progress, pbar_id, advance=missing)
|
|
1063
1641
|
|
|
1064
|
-
|
|
1642
|
+
finally:
|
|
1643
|
+
# nothing to clean here, but keep symmetry with other paths
|
|
1644
|
+
pass
|
|
1065
1645
|
|
|
1066
1646
|
if display_config.show_indicator and _use_bar_indicator:
|
|
1067
1647
|
progress = Progress(
|
|
@@ -1102,7 +1682,10 @@ async def a_execute_agentic_test_cases(

 async def execute_with_semaphore(func: Callable, *args, **kwargs):
 async with semaphore:
-
+timeout = _per_task_timeout()
+return await _await_with_outer_deadline(
+func, *args, timeout=timeout, **kwargs
+)

 test_run_manager = global_test_run_manager
 test_run_manager.save_to_disk = cache_config.write_cache
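Editorial note, not part of the package diff: the `execute_with_semaphore` change above routes every semaphore-guarded awaitable through a per-task deadline via deepeval's internal `_per_task_timeout()` / `_await_with_outer_deadline()` helpers. A minimal sketch of that general pattern using only the standard library follows; `PER_TASK_TIMEOUT` and `run_with_semaphore` are illustrative names, not deepeval APIs.

```python
import asyncio

PER_TASK_TIMEOUT = 30.0  # stand-in for a configurable per-task deadline


async def run_with_semaphore(semaphore: asyncio.Semaphore, coro_factory, *args):
    # Limit concurrency with the semaphore, then bound the awaited work with a
    # deadline; wait_for cancels the inner coroutine once the timeout elapses.
    async with semaphore:
        return await asyncio.wait_for(coro_factory(*args), timeout=PER_TASK_TIMEOUT)


async def main():
    sem = asyncio.Semaphore(2)  # at most two test cases in flight

    async def fake_test_case(i: int) -> int:
        await asyncio.sleep(0.1)
        return i

    results = await asyncio.gather(
        *(run_with_semaphore(sem, fake_test_case, i) for i in range(5))
    )
    print(results)


if __name__ == "__main__":
    asyncio.run(main())
```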
@@ -1149,7 +1732,19 @@ async def a_execute_agentic_test_cases(
 tasks.append(asyncio.create_task(task))
 await asyncio.sleep(async_config.throttle_value)

-
+try:
+await asyncio.wait_for(
+asyncio.gather(*tasks),
+timeout=_gather_timeout(),
+)
+except (asyncio.TimeoutError, TimeoutError):
+# Cancel any still-pending tasks and drain them
+for t in tasks:
+if not t.done():
+t.cancel()
+await asyncio.gather(*tasks, return_exceptions=True)
+raise
+
 else:
 for golden in goldens:
 with capture_evaluation_run("golden"):
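Editorial note, not part of the package diff: the hunk above wraps the task gather in an overall timeout and, on expiry, cancels whatever is still pending and drains it with `return_exceptions=True` before re-raising. A self-contained sketch of that cancel-and-drain pattern, with illustrative names only:

```python
import asyncio


async def gather_with_deadline(coros, timeout: float):
    tasks = [asyncio.create_task(c) for c in coros]
    try:
        return await asyncio.wait_for(asyncio.gather(*tasks), timeout=timeout)
    except (asyncio.TimeoutError, TimeoutError):
        # Cancel whatever is still running, then drain so the cancellations are
        # observed and no "exception was never retrieved" warnings leak out.
        for t in tasks:
            if not t.done():
                t.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
        raise


async def main():
    async def slow(delay: float) -> float:
        await asyncio.sleep(delay)
        return delay

    try:
        print(await gather_with_deadline([slow(0.01), slow(10)], timeout=0.1))
    except (asyncio.TimeoutError, TimeoutError):
        print("timed out; pending work was cancelled and drained")


if __name__ == "__main__":
    asyncio.run(main())
```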
@@ -1194,93 +1789,89 @@ async def _a_execute_agentic_test_case(
 progress: Optional[Progress] = None,
 pbar_id: Optional[int] = None,
 ):
-
-
-
-
-
-
-
+test_start_time = time.perf_counter()
+current_trace = None
+trace_api = None
+test_case = None
+api_test_case = None
+try:
+if observed_callback:
+total_tags = count_observe_decorators_in_module(observed_callback)
+pbar_tags_id = add_pbar(
+progress,
+f" ⚡ Invoking observed callback (#{count})",
+total=total_tags,
+)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Call callback and extract trace
+with Observer(
+"custom",
+func_name="Test Wrapper",
+_progress=progress,
+_pbar_callback_id=pbar_tags_id,
+):
+# get current_trace right away, we need it even if cancelled
+current_trace: Trace = current_trace_context.get()
+if asyncio.iscoroutinefunction(observed_callback):
+await _await_with_outer_deadline(
+observed_callback,
+golden.input,
+timeout=_per_task_timeout(),
+)
+else:
+observed_callback(golden.input)

-
-
+update_pbar(progress, pbar_tags_id, advance=total_tags)
+update_pbar(progress, pbar_id)

-
-
+elif trace:
+current_trace = trace

-
-current_trace.metrics = trace_metrics
+trace_level_metrics_count = 0

-
-
+if trace_metrics:
+current_trace.metrics = trace_metrics

-
-
-)
+# run evals through DFS
+trace_api = create_api_trace(trace=current_trace, golden=golden)

-
-
-
-total=count_metrics_in_trace(trace=current_trace)
-+ trace_level_metrics_count,
-)
+trace_level_metrics_count = (
+len(current_trace.metrics) if current_trace.metrics else 0
+)

-
-
-
-
-
-
-),
-expected_output=current_trace.expected_output,
-context=current_trace.context,
-retrieval_context=current_trace.retrieval_context,
-tools_called=current_trace.tools_called,
-expected_tools=current_trace.expected_tools,
-additional_metadata=golden.additional_metadata,
-comments=golden.comments,
-name=golden.name,
-_dataset_alias=golden._dataset_alias,
-_dataset_id=golden._dataset_id,
-)
-api_test_case = create_api_test_case(
-test_case=test_case,
-trace=trace_api,
-index=count if not _is_assert_test else None,
-)
+pbar_eval_id = add_pbar(
+progress,
+f" 🎯 Evaluating component(s) (#{count})",
+total=count_metrics_in_trace(trace=current_trace)
++ trace_level_metrics_count,
+)

-
-
-
-
-
-
-
-
-
-
-
-
+test_case = LLMTestCase(
+input=golden.input,
+actual_output=(
+str(current_trace.output)
+if current_trace.output is not None
+else None
+),
+expected_output=current_trace.expected_output,
+context=current_trace.context,
+retrieval_context=current_trace.retrieval_context,
+tools_called=current_trace.tools_called,
+expected_tools=current_trace.expected_tools,
+additional_metadata=golden.additional_metadata,
+comments=golden.comments,
+name=golden.name,
+_dataset_alias=golden._dataset_alias,
+_dataset_id=golden._dataset_id,
+)
+api_test_case = create_api_test_case(
+test_case=test_case,
+trace=trace_api,
+index=count if not _is_assert_test else None,
+)

-
-
-span=span,
+await _a_execute_trace_test_case(
+trace=current_trace,
 trace_api=trace_api,
 api_test_case=api_test_case,
 ignore_errors=ignore_errors,
@@ -1289,39 +1880,155 @@ async def _a_execute_agentic_test_case(
 verbose_mode=verbose_mode,
 progress=progress,
 pbar_eval_id=pbar_eval_id,
-test_run_manager=test_run_manager,
 _use_bar_indicator=_use_bar_indicator,
 )
-child_tasks = [dfs(child) for child in span.children]
-if child_tasks:
-await asyncio.gather(*child_tasks)

-
-
-
-
-
-
-
-
-
-
-
+async def dfs(trace: Trace, span: BaseSpan):
+await _a_execute_span_test_case(
+span=span,
+current_trace=trace,
+trace_api=trace_api,
+api_test_case=api_test_case,
+ignore_errors=ignore_errors,
+skip_on_missing_params=skip_on_missing_params,
+show_indicator=show_indicator,
+verbose_mode=verbose_mode,
+progress=progress,
+pbar_eval_id=pbar_eval_id,
+test_run_manager=test_run_manager,
+_use_bar_indicator=_use_bar_indicator,
 )

-
-
+if _skip_metrics_for_error(span=span, trace=trace):
+return
+
+child_tasks = [
+asyncio.create_task(dfs(trace, child))
+for child in span.children
+]
+if child_tasks:
+try:
+await asyncio.wait_for(
+asyncio.gather(*child_tasks),
+timeout=_gather_timeout(),
+)
+except (asyncio.TimeoutError, TimeoutError):
+for t in child_tasks:
+if not t.done():
+t.cancel()
+await asyncio.gather(*child_tasks, return_exceptions=True)
+raise
+
+if not _skip_metrics_for_error(trace=current_trace):
+if current_trace and current_trace.root_spans:
+await dfs(current_trace, current_trace.root_spans[0])
+else:
+if (
+logger.isEnabledFor(logging.DEBUG)
+and get_settings().DEEPEVAL_VERBOSE_MODE
+):
+logger.debug(
+"Skipping DFS: empty trace or no root spans (trace=%s)",
+current_trace.uuid if current_trace else None,
+)
+except asyncio.CancelledError:
+# mark any unfinished metrics as cancelled
+cancel_msg = (
+"Timed out/cancelled while evaluating agentic test case. "
+"Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+"DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+)
+
+if trace_metrics:
+for m in trace_metrics:
+if getattr(m, "skipped", False):
+continue
+if getattr(m, "success", None) is None and not getattr(
+m, "error", None
+):
+m.success = False
+m.error = cancel_msg
+
+if trace is not None and trace.metrics:
+for m in trace.metrics:
+if getattr(m, "skipped", False):
+continue
+if getattr(m, "success", None) is None and not getattr(
+m, "error", None
+):
+m.success = False
+m.error = cancel_msg
+if not ignore_errors:
+raise
+finally:
+try:
+if api_test_case is None:
+if test_case is None:
+test_case = LLMTestCase(
+input=golden.input,
+actual_output=None,
+expected_output=None,
+context=None,
+retrieval_context=None,
+additional_metadata=golden.additional_metadata,
+tools_called=None,
+expected_tools=None,
+comments=golden.comments,
+name=golden.name,
+_dataset_alias=golden._dataset_alias,
+_dataset_id=golden._dataset_id,
+)
+if trace is not None and trace_api is None:
+trace_api = create_api_trace(trace, golden)

-
-
-
-
+api_test_case = create_api_test_case(
+test_case=test_case,
+trace=trace_api,
+index=(count if not _is_assert_test else None),
+)

-
+# attach MetricData for any trace metrics we marked above
+if trace_metrics:
+for m in trace_metrics:
+if getattr(m, "skipped", False):
+continue
+api_test_case.update_metric_data(create_metric_data(m))
+
+# If nothing set success yet, mark the case failed
+if api_test_case.success is None:
+api_test_case.update_status(False)
+
+# test_run_manager.update_test_run returns early if api_test_case.metrics_data is an empty list.
+# Set it to None to ensure the test_case is added
+if api_test_case.metrics_data == [] and api_test_case.trace is None:
+api_test_case.metrics_data = None
+
+# Duration & persist
+test_end_time = time.perf_counter()
+run_duration = test_end_time - test_start_time
+api_test_case.update_run_duration(run_duration)
+test_run_manager.update_test_run(api_test_case, test_case)
+
+# Build results and de-duplicate against trace results
+main_result = create_test_result(api_test_case)
+trace_results = (
+extract_trace_test_results(trace_api)
+if trace_api is not None
+else []
+)
+unique_trace_results = filter_duplicate_results(
+main_result, trace_results
+)
+test_results.append(main_result)
+test_results.extend(unique_trace_results)
+update_pbar(progress, pbar_id)
+finally:
+pass


 async def _a_execute_span_test_case(
 span: BaseSpan,
+current_trace: Trace,
 trace_api: TraceApi,
 api_test_case: LLMApiTestCase,
 ignore_errors: bool,
@@ -1346,12 +2053,22 @@ async def _a_execute_span_test_case(
 else:
 trace_api.base_spans.append(api_span)

-if span
+if _skip_metrics_for_error(span=span, trace=current_trace):
+api_span.status = TraceSpanApiStatus.ERRORED
+api_span.error = span.error or _trace_error(current_trace)
+if progress and pbar_eval_id is not None:
+update_pbar(
+progress,
+pbar_eval_id,
+advance=count_metrics_in_span_subtree(span),
+)
+return
+
+metrics: List[BaseMetric] = list(span.metrics or [])
+if not metrics:
 return

-
-isinstance(metric, TaskCompletionMetric) for metric in span.metrics
-)
+requires_trace = any(metric.requires_trace for metric in metrics)

 llm_test_case = None
 if span.input:
@@ -1364,17 +2081,29 @@ async def _a_execute_span_test_case(
 tools_called=span.tools_called,
 expected_tools=span.expected_tools,
 )
-
-
-
-
+
+if not requires_trace:
+if llm_test_case is None:
+api_span.status = TraceSpanApiStatus.ERRORED
+api_span.error = format_error_text(
+DeepEvalError(
+"Span has metrics but no LLMTestCase. "
+"Are you sure you called `update_current_span()`?"
+)
+)
+if progress and pbar_eval_id is not None:
+update_pbar(
+progress,
+pbar_eval_id,
+advance=count_metrics_in_span_subtree(span),
+)
+return

 show_metrics_indicator = show_indicator and not _use_bar_indicator
-metrics: List[BaseMetric] = span.metrics
 test_case: Optional[LLMTestCase] = llm_test_case

 # add trace if task completion
-if
+if requires_trace:
 if test_case is None:
 test_case = LLMTestCase(input="None")
 test_case._trace_dict = trace_manager.create_nested_spans_dict(span)
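Editorial note, not part of the package diff: the two span hunks above hinge on a single flag, whether any metric on the span has `requires_trace` set, to decide between evaluating against the nested trace dict and requiring a span-level `LLMTestCase`. A reduced sketch of that decision with stand-in classes (these are not deepeval's `BaseSpan`/`BaseMetric` types):

```python
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class Metric:
    name: str
    requires_trace: bool = False


@dataclass
class Span:
    metrics: List[Metric] = field(default_factory=list)
    input: Optional[str] = None
    error: Optional[str] = None


def plan_span_evaluation(span: Span) -> str:
    metrics = list(span.metrics or [])
    if not metrics:
        return "nothing to evaluate"
    if any(m.requires_trace for m in metrics):
        # trace-based metrics can run even without a span-level test case
        return "evaluate against the nested trace dict"
    if span.input is None:
        span.error = "Span has metrics but no LLMTestCase."
        return "mark span errored and skip its metrics"
    return "evaluate against an LLMTestCase built from the span"


print(plan_span_evaluation(Span(metrics=[Metric("tool use", requires_trace=True)])))
```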
@@ -1418,12 +2147,22 @@ async def _a_execute_trace_test_case(
 pbar_eval_id: Optional[int],
 _use_bar_indicator: bool,
 ):
-
+
+if _skip_metrics_for_error(trace=trace):
+trace_api.status = TraceSpanApiStatus.ERRORED
+if progress and pbar_eval_id is not None:
+update_pbar(
+progress,
+pbar_eval_id,
+advance=count_total_metrics_for_trace(trace),
+)
 return

-
-
-
+metrics: List[BaseMetric] = list(trace.metrics or [])
+if not metrics:
+return
+
+requires_trace = any(metric.requires_trace for metric in metrics)

 llm_test_case = None
 if trace.input:
@@ -1438,17 +2177,32 @@ async def _a_execute_trace_test_case(
 tools_called=trace.tools_called,
 expected_tools=trace.expected_tools,
 )
-
-
-
-
+
+if not requires_trace:
+if llm_test_case is None:
+trace.status = TraceSpanStatus.ERRORED
+trace_api.status = TraceSpanApiStatus.ERRORED
+if trace.root_spans:
+trace.root_spans[0].status = TraceSpanStatus.ERRORED
+trace.root_spans[0].error = format_error_text(
+DeepEvalError(
+"Trace has metrics but no LLMTestCase (missing input/output). "
+"Are you sure you called `update_current_trace()`?"
+)
+)
+if progress and pbar_eval_id is not None:
+update_pbar(
+progress,
+pbar_eval_id,
+advance=count_total_metrics_for_trace(trace),
+)
+return

 show_metrics_indicator = show_indicator and not _use_bar_indicator
-metrics: List[BaseMetric] = trace.metrics
 test_case: Optional[LLMTestCase] = llm_test_case

 # add trace if task completion
-if
+if requires_trace:
 if test_case is None:
 test_case = LLMTestCase(input="None")
 test_case._trace_dict = trace_manager.create_nested_spans_dict(
@@ -1578,11 +2332,12 @@ def execute_agentic_test_cases_from_loop(
 pbar_eval_id: Optional[int] = None,
 ):
 # Create API Span
-metrics: List[BaseMetric] = span.metrics
+metrics: List[BaseMetric] = list(span.metrics or [])

 api_span: BaseApiSpan = (
 trace_manager._convert_span_to_api_span(span)
 )
+
 if isinstance(span, AgentSpan):
 trace_api.agent_spans.append(api_span)
 elif isinstance(span, LlmSpan):
@@ -1595,9 +2350,30 @@ def execute_agentic_test_cases_from_loop(
 else:
 trace_api.base_spans.append(api_span)

+# Skip errored trace/span
+if _skip_metrics_for_error(span=span, trace=current_trace):
+api_span.status = TraceSpanApiStatus.ERRORED
+api_span.error = span.error or _trace_error(
+current_trace
+)
+if progress and pbar_eval_id is not None:
+update_pbar(
+progress,
+pbar_eval_id,
+advance=count_metrics_in_span_subtree(span),
+)
+return
+
 for child in span.children:
 dfs(child, progress, pbar_eval_id)

+if not span.metrics:
+return
+
+requires_trace = any(
+metric.requires_trace for metric in metrics
+)
+
 llm_test_case = None
 if span.input is not None:
 llm_test_case = LLMTestCase(
@@ -1613,20 +2389,29 @@ def execute_agentic_test_cases_from_loop(
 tools_called=span.tools_called,
 expected_tools=span.expected_tools,
 )
-if span.metrics is None or llm_test_case is None:
-return

-
-isinstance(metric, TaskCompletionMetric)
-for metric in metrics
-)
-
-if has_task_completion:
+if requires_trace:
 if llm_test_case is None:
 llm_test_case = LLMTestCase(input="None")
 llm_test_case._trace_dict = (
 trace_manager.create_nested_spans_dict(span)
 )
+else:
+if llm_test_case is None:
+api_span.status = TraceSpanApiStatus.ERRORED
+api_span.error = format_error_text(
+DeepEvalError(
+"Span has metrics but no LLMTestCase. "
+"Are you sure you called `update_current_span()`?"
+)
+)
+if progress and pbar_eval_id is not None:
+update_pbar(
+progress,
+pbar_eval_id,
+advance=count_metrics_in_span_subtree(span),
+)
+return

 # Preparing metric calculation
 api_span.metrics_data = []
@@ -1670,77 +2455,123 @@ def execute_agentic_test_cases_from_loop(
 start_time = time.perf_counter()

 # Handle trace-level metrics
-
-
-
-
-
-
-
-
-
-input=str(current_trace.input),
-actual_output=(
-str(current_trace.output)
-if current_trace.output is not None
-else None
+skip_metrics_for_this_golden = False
+if _skip_metrics_for_error(trace=current_trace):
+trace_api.status = TraceSpanApiStatus.ERRORED
+if progress and pbar_eval_id is not None:
+update_pbar(
+progress,
+pbar_eval_id,
+advance=count_total_metrics_for_trace(
+current_trace
 ),
-expected_output=current_trace.expected_output,
-context=current_trace.context,
-retrieval_context=current_trace.retrieval_context,
-tools_called=current_trace.tools_called,
-expected_tools=current_trace.expected_tools,
 )
-
-
-
+else:
+if current_trace.metrics:
+requires_trace = any(
+metric.requires_trace
+for metric in current_trace.metrics
 )

-
-if
-llm_test_case = LLMTestCase(
-
-
-
+llm_test_case = None
+if current_trace.input:
+llm_test_case = LLMTestCase(
+input=str(current_trace.input),
+actual_output=(
+str(current_trace.output)
+if current_trace.output is not None
+else None
+),
+expected_output=current_trace.expected_output,
+context=current_trace.context,
+retrieval_context=current_trace.retrieval_context,
+tools_called=current_trace.tools_called,
+expected_tools=current_trace.expected_tools,
 )
-)
-
-for metric in current_trace.metrics:
-metric.skipped = False
-metric.error = None
-if display_config.verbose_mode is not None:
-metric.verbose_mode = display_config.verbose_mode
-
-trace_api.metrics_data = []
-for metric in current_trace.metrics:
-res = _execute_metric(
-metric=metric,
-test_case=llm_test_case,
-show_metric_indicator=show_metric_indicator,
-in_component=True,
-error_config=error_config,
-)
-if res == "skip":
-continue
-
-if not metric.skipped:
-metric_data = create_metric_data(metric)
-trace_api.metrics_data.append(metric_data)
-api_test_case.update_metric_data(metric_data)
-api_test_case.update_status(metric_data.success)
-update_pbar(progress, pbar_eval_id)
-
-# Then handle span-level metrics
-dfs(current_trace.root_spans[0], progress, pbar_eval_id)
-end_time = time.perf_counter()
-run_duration = end_time - start_time

-
-
-
-
+if requires_trace:
+if llm_test_case is None:
+llm_test_case = LLMTestCase(input="None")
+llm_test_case._trace_dict = (
+trace_manager.create_nested_spans_dict(
+current_trace.root_spans[0]
+)
+)
+else:
+if llm_test_case is None:
+current_trace.status = TraceSpanStatus.ERRORED
+trace_api.status = TraceSpanApiStatus.ERRORED
+if current_trace.root_spans:
+current_trace.root_spans[0].status = (
+TraceSpanStatus.ERRORED
+)
+current_trace.root_spans[0].error = (
+format_error_text(
+DeepEvalError(
+"Trace has metrics but no LLMTestCase (missing input/output). "
+"Are you sure you called `update_current_trace()`?"
+)
+)
+)
+if progress and pbar_eval_id is not None:
+update_pbar(
+progress,
+pbar_eval_id,
+advance=count_total_metrics_for_trace(
+current_trace
+),
+)
+skip_metrics_for_this_golden = True
+
+if not skip_metrics_for_this_golden:
+for metric in current_trace.metrics:
+metric.skipped = False
+metric.error = None
+if display_config.verbose_mode is not None:
+metric.verbose_mode = (
+display_config.verbose_mode
+)
+
+trace_api.metrics_data = []
+for metric in current_trace.metrics:
+res = _execute_metric(
+metric=metric,
+test_case=llm_test_case,
+show_metric_indicator=show_metric_indicator,
+in_component=True,
+error_config=error_config,
+)
+if res == "skip":
+continue
+
+if not metric.skipped:
+metric_data = create_metric_data(metric)
+trace_api.metrics_data.append(metric_data)
+api_test_case.update_metric_data(
+metric_data
+)
+api_test_case.update_status(
+metric_data.success
+)
+update_pbar(progress, pbar_eval_id)
+
+# Then handle span-level metrics
+dfs(current_trace.root_spans[0], progress, pbar_eval_id)
+
+end_time = time.perf_counter()
+run_duration = end_time - start_time
+# Update test run
+api_test_case.update_run_duration(run_duration)
+test_run_manager.update_test_run(api_test_case, test_case)
+main_result = create_test_result(api_test_case)
+trace_results = extract_trace_test_results(trace_api)
+unique_trace_results = filter_duplicate_results(
+main_result, trace_results
+)
+test_results.append(main_result)
+test_results.extend(unique_trace_results)

-
+update_pbar(progress, pbar_id)

 try:
 if display_config.show_indicator and _use_bar_indicator:
@@ -1798,9 +2629,8 @@ def a_execute_agentic_test_cases_from_loop(

 async def execute_callback_with_semaphore(coroutine: Awaitable):
 async with semaphore:
-
-
-)
+timeout = _per_task_timeout()
+return await _await_with_outer_deadline(coroutine, timeout=timeout)

 def evaluate_test_cases(
 progress: Optional[Progress] = None,
@@ -1841,39 +2671,146 @@ def a_execute_agentic_test_cases_from_loop(
 }

 def on_task_done(t: asyncio.Task):
+cancelled = False
+exc = None
+trace = None
+root = None
+resolved_trace_from_task = False
+resolved_root_from_task = False
+
+# Task.exception() raises CancelledError if task was cancelled
+try:
+exc = t.exception()
+except asyncio.CancelledError:
+cancelled = True
+exc = None
+
+meta = task_meta.get(t, {})
+golden_index = meta.get("golden_index")
+
+if golden_index is not None and 0 <= golden_index < len(
+goldens
+):
+golden = goldens[golden_index]
+
+def _mark_trace_error(trace, root, msg: str):
+now = time.perf_counter()
+trace.status = TraceSpanStatus.ERRORED
+# Close the trace so the API layer has a proper endTime
+if trace.end_time is None:
+trace.end_time = now
+if root:
+root.status = TraceSpanStatus.ERRORED
+root.error = msg
+if root.end_time is None:
+root.end_time = now
+
+if exc is not None:
+msg = format_error_text(exc)
+trace, root = _resolve_trace_and_root_for_task(t)
+resolved_trace_from_task = bool(trace)
+resolved_root_from_task = bool(root)
+if trace:
+_mark_trace_error(trace, root, msg)
+else:
+for (
+trace
+) in trace_manager.integration_traces_to_evaluate:
+if (
+trace_manager.trace_uuid_to_golden.get(
+trace.uuid
+)
+is golden
+):
+root = _pick_root_for_marking(trace)
+_mark_trace_error(trace, root, msg)
+break
+
+elif cancelled or t.cancelled():
+cancel_exc = DeepEvalError(
+"Task was cancelled (likely due to timeout)."
+)
+msg = format_error_text(cancel_exc)
+trace, root = _resolve_trace_and_root_for_task(t)
+resolved_trace_from_task = bool(trace)
+resolved_root_from_task = bool(root)
+if trace:
+_mark_trace_error(trace, root, msg)
+else:
+for (
+trace
+) in trace_manager.integration_traces_to_evaluate:
+if (
+trace_manager.trace_uuid_to_golden.get(
+trace.uuid
+)
+is golden
+):
+root = _pick_root_for_marking(trace)
+_mark_trace_error(trace, root, msg)
+break
+
 if get_settings().DEEPEVAL_DEBUG_ASYNC:
 # Using info level here to make it easy to spot these logs.
-
-meta = task_meta.get(t, {})
+golden_name = meta.get("golden_name")
 duration = time.perf_counter() - meta.get(
 "started", started
 )

-if
+if cancelled or exc is not None:
+if not resolved_trace_from_task:
+logger.warning(
+"[deepeval] on_task_done: no binding for task; falling back to golden->trace. task=%s golden=%r",
+t.get_name(),
+golden_name,
+)
+elif not resolved_root_from_task:
+logger.warning(
+"[deepeval] on_task_done: bound trace found but no bound root; using heuristic. task=%s trace=%s",
+t.get_name(),
+trace.uuid,
+)
+
+if cancelled:
 logger.info(
 "[deepeval] task CANCELLED %s after %.2fs meta=%r",
 t.get_name(),
 duration,
 meta,
 )
-
-
-
-
-
-
-
-
-
-
-else:
-logger.info(
-"[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
-t.get_name(),
-duration,
-meta.get("golden_index"),
+elif exc is not None:
+
+show_trace = bool(
+get_settings().DEEPEVAL_LOG_STACK_TRACES
+)
+exc_info = (
+(
+type(exc),
+exc,
+getattr(exc, "__traceback__", None),
 )
+if show_trace
+else None
+)
+logger.error(
+"[deepeval] task ERROR %s after %.2fs meta=%r",
+t.get_name(),
+duration,
+meta,
+exc_info=exc_info,
+)
+else:
+logger.info(
+"[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
+t.get_name(),
+duration,
+meta.get("golden_index"),
+)

+try:
+trace_manager.task_bindings.pop(t, None)
+except Exception:
+pass
 update_pbar(progress, pbar_callback_id)
 update_pbar(progress, pbar_id)

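Editorial note, not part of the package diff: the reworked `on_task_done` above depends on the fact that `asyncio.Task.exception()` raises `CancelledError` when the task was cancelled, so the outcome has to be read behind a guard before deciding whether to log a cancellation, an error, or a success. A self-contained sketch of that guard (illustrative names, no deepeval internals):

```python
import asyncio


def on_task_done(t: asyncio.Task) -> None:
    cancelled = False
    exc = None
    try:
        exc = t.exception()  # raises CancelledError for cancelled tasks
    except asyncio.CancelledError:
        cancelled = True
    if cancelled:
        print(f"{t.get_name()}: cancelled")
    elif exc is not None:
        print(f"{t.get_name()}: failed with {exc!r}")
    else:
        print(f"{t.get_name()}: ok -> {t.result()!r}")


async def main():
    async def work(i: int) -> int:
        await asyncio.sleep(0.05 * i)
        if i == 2:
            raise ValueError("boom")
        return i

    tasks = [asyncio.create_task(work(i), name=f"task-{i}") for i in range(3)]
    for t in tasks:
        t.add_done_callback(on_task_done)
    await asyncio.gather(*tasks, return_exceptions=True)


if __name__ == "__main__":
    asyncio.run(main())
```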
@@ -1918,7 +2855,8 @@ def a_execute_agentic_test_cases_from_loop(
 timeout=_gather_timeout(),
 )
 )
-
+
+except (asyncio.TimeoutError, TimeoutError):
 import traceback

 pending = [t for t in created_tasks if not t.done()]
@@ -1987,10 +2925,11 @@ def a_execute_agentic_test_cases_from_loop(
 ]

 if get_settings().DEEPEVAL_DEBUG_ASYNC:
-
-
-
-
+if len(leftovers) > 0:
+logger.warning(
+"[deepeval] %d stray task(s) not tracked; cancelling...",
+len(leftovers),
+)
 for t in leftovers:
 meta = task_meta.get(t, {})
 name = t.get_name()
@@ -2130,7 +3069,10 @@ async def _a_evaluate_traces(

 async def execute_evals_with_semaphore(func: Callable, *args, **kwargs):
 async with semaphore:
-
+timeout = _per_task_timeout()
+return await _await_with_outer_deadline(
+func, *args, timeout=timeout, **kwargs
+)

 eval_tasks = []
 # Here, we will work off a fixed-set copy to avoid surprises from potential
@@ -2173,7 +3115,18 @@ async def _a_evaluate_traces(
 )
 eval_tasks.append(asyncio.create_task(task))
 await asyncio.sleep(throttle_value)
-
+
+try:
+await asyncio.wait_for(
+asyncio.gather(*eval_tasks),
+timeout=_gather_timeout(),
+)
+except (asyncio.TimeoutError, TimeoutError):
+for t in eval_tasks:
+if not t.done():
+t.cancel()
+await asyncio.gather(*eval_tasks, return_exceptions=True)
+raise


 async def _evaluate_test_case_pairs(
@@ -2196,7 +3149,10 @@ async def _evaluate_test_case_pairs(

 async def execute_with_semaphore(func: Callable, *args, **kwargs):
 async with semaphore:
-
+timeout = _per_task_timeout()
+return await _await_with_outer_deadline(
+func, *args, timeout=timeout, **kwargs
+)

 tasks = []
 for count, test_case_pair in enumerate(test_case_pairs):
@@ -2229,7 +3185,19 @@ async def _evaluate_test_case_pairs(
 )
 tasks.append(asyncio.create_task(task))
 await asyncio.sleep(throttle_value)
-
+
+try:
+await asyncio.wait_for(
+asyncio.gather(*tasks),
+timeout=_gather_timeout(),
+)
+except (asyncio.TimeoutError, TimeoutError):
+# Cancel any still-pending tasks and drain them
+for t in tasks:
+if not t.done():
+t.cancel()
+await asyncio.gather(*tasks, return_exceptions=True)
+raise


 def _execute_metric(
@@ -2248,10 +3216,13 @@ def _execute_metric(
 )
 except MissingTestCaseParamsError as e:
 if error_config.skip_on_missing_params:
+metric.skipped = True
+metric.error = None
+metric.success = None
 return "skip"
 else:
 if error_config.ignore_errors:
-metric.error =
+metric.error = format_error_text(e)
 metric.success = False
 else:
 raise
@@ -2260,22 +3231,25 @@ def _execute_metric(
 metric.measure(test_case)
 except MissingTestCaseParamsError as e:
 if error_config.skip_on_missing_params:
+metric.skipped = True
+metric.error = None
+metric.success = None
 return "skip"
 else:
 if error_config.ignore_errors:
-metric.error =
+metric.error = format_error_text(e)
 metric.success = False
 else:
 raise
 except Exception as e:
 if error_config.ignore_errors:
-metric.error =
+metric.error = format_error_text(e)
 metric.success = False
 else:
 raise
 except Exception as e:
 if error_config.ignore_errors:
-metric.error =
+metric.error = format_error_text(e)
 metric.success = False
 else:
 raise
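Editorial note, not part of the package diff: the `_execute_metric` changes above settle on a three-way contract: on `MissingTestCaseParamsError` with `skip_on_missing_params`, reset the metric to a clean skipped state; on other errors with `ignore_errors`, record a formatted error string and mark the metric failed; otherwise re-raise. A standalone sketch of that contract with stand-in types (`MissingParams` and `Metric` are not deepeval classes):

```python
class MissingParams(Exception):
    """Stand-in for deepeval's MissingTestCaseParamsError."""


class Metric:
    def __init__(self, fn):
        self.fn = fn
        self.skipped = False
        self.error = None
        self.success = None


def format_error_text(e: Exception) -> str:
    return f"{type(e).__name__}: {e}"


def execute_metric(metric: Metric, skip_on_missing_params: bool, ignore_errors: bool):
    try:
        metric.success = bool(metric.fn())
    except MissingParams as e:
        if skip_on_missing_params:
            # leave the metric in a clean "skipped" state instead of a failure
            metric.skipped = True
            metric.error = None
            metric.success = None
            return "skip"
        if ignore_errors:
            metric.error = format_error_text(e)
            metric.success = False
        else:
            raise
    except Exception as e:
        if ignore_errors:
            metric.error = format_error_text(e)
            metric.success = False
        else:
            raise
    return "done"


def needs_params():
    raise MissingParams("actual_output is required")


m = Metric(needs_params)
print(execute_metric(m, skip_on_missing_params=True, ignore_errors=False), m.skipped)
```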