deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
@@ -43,15 +43,19 @@ from deepeval.tracing.api import (
43
43
  )
44
44
  from deepeval.dataset import Golden
45
45
  from deepeval.contextvars import set_current_golden, reset_current_golden
46
- from deepeval.errors import MissingTestCaseParamsError
46
+ from deepeval.errors import MissingTestCaseParamsError, DeepEvalError
47
47
  from deepeval.metrics.utils import copy_metrics
48
- from deepeval.utils import get_or_create_event_loop, shorten, len_medium
48
+ from deepeval.utils import (
49
+ get_or_create_event_loop,
50
+ shorten,
51
+ len_medium,
52
+ format_error_text,
53
+ )
49
54
  from deepeval.telemetry import capture_evaluation_run
50
55
  from deepeval.metrics import (
51
56
  BaseMetric,
52
57
  BaseConversationalMetric,
53
58
  BaseMultimodalMetric,
54
- TaskCompletionMetric,
55
59
  )
56
60
  from deepeval.metrics.indicator import (
57
61
  measure_metrics_with_indicator,
@@ -61,6 +65,7 @@ from deepeval.test_case import (
61
65
  ConversationalTestCase,
62
66
  MLLMTestCase,
63
67
  )
68
+ from deepeval.test_case.api import create_api_test_case
64
69
  from deepeval.test_run import (
65
70
  global_test_run_manager,
66
71
  LLMApiTestCase,
@@ -80,19 +85,127 @@ from deepeval.evaluate.utils import (
80
85
  create_api_trace,
81
86
  create_metric_data,
82
87
  create_test_result,
83
- create_api_test_case,
84
88
  count_metrics_in_trace,
89
+ count_total_metrics_for_trace,
90
+ count_metrics_in_span_subtree,
85
91
  extract_trace_test_results,
86
92
  )
87
93
  from deepeval.utils import add_pbar, update_pbar, custom_console
88
- from deepeval.openai.utils import openai_test_case_pairs
89
- from deepeval.tracing.types import TestCaseMetricPair
94
+ from deepeval.tracing.types import TestCaseMetricPair, TraceSpanStatus
95
+ from deepeval.tracing.api import TraceSpanApiStatus
90
96
  from deepeval.config.settings import get_settings
91
-
97
+ from deepeval.test_run import TEMP_FILE_PATH
98
+ from deepeval.confident.api import is_confident
99
+ from deepeval.test_run.hyperparameters import (
100
+ process_hyperparameters,
101
+ process_prompts,
102
+ )
92
103
 
93
104
  logger = logging.getLogger(__name__)
94
105
 
95
106
 
107
+ def _skip_metrics_for_error(
108
+ span: Optional[BaseSpan] = None,
109
+ trace: Optional[Trace] = None,
110
+ ) -> bool:
111
+ # trace failure: skip everything under this trace
112
+ if trace is not None and trace.status == TraceSpanStatus.ERRORED:
113
+ return True
114
+ # span failure: skip this span’s metrics
115
+ if span is not None and span.status == TraceSpanStatus.ERRORED:
116
+ return True
117
+ return False
118
+
119
+
120
+ def _trace_error(current_trace: Trace) -> Optional[str]:
121
+ def _first_err(s: BaseSpan) -> Optional[str]:
122
+ if s.status == TraceSpanStatus.ERRORED and s.error:
123
+ return s.error
124
+ for c in s.children or []:
125
+ e = _first_err(c)
126
+ if e:
127
+ return e
128
+ return None
129
+
130
+ for root in current_trace.root_spans or []:
131
+ e = _first_err(root)
132
+ if e:
133
+ return e
134
+ return None
135
+
136
+
137
+ def _get_trace_by_uuid_anywhere(trace_uuid: str):
138
+ """
139
+ Resolver for a trace UUID across the manager's state.
140
+
141
+ First tries the manager's indexed lookup, which covers active/in-flight traces,
142
+ then does a linear scan of the full `trace_manager.traces` list, which covers
143
+ traces that were recorded/closed earlier or not yet indexed. Returns
144
+ the concrete Trace object or None if not found.
145
+ """
146
+ tr = trace_manager.get_trace_by_uuid(trace_uuid)
147
+ if tr:
148
+ return tr
149
+ for tr in trace_manager.traces:
150
+ if tr.uuid == trace_uuid:
151
+ return tr
152
+ return None
153
+
154
+
155
+ def _pick_root_for_marking(trace):
156
+ """
157
+ Choose the most appropriate root span to annotate on error/cancel.
158
+
159
+ Heuristic:
160
+ - Prefer the most recent open root, which will have no `end_time` since this is the
161
+ span currently in flight.
162
+ - If none are open, use the last root span if it exists.
163
+ - If the trace has no roots, return None.
164
+
165
+ This favors marking the active root in multi-root traces while remaining
166
+ stable for already-closed traces.
167
+ """
168
+ open_roots = [rs for rs in trace.root_spans if rs.end_time is None]
169
+ return (
170
+ open_roots[-1]
171
+ if open_roots
172
+ else (trace.root_spans[-1] if trace.root_spans else None)
173
+ )
174
+
175
+
176
+ def _resolve_trace_and_root_for_task(t: asyncio.Task):
177
+ """
178
+ Resolve trace and root for a completed task using the weak binding map.
179
+
180
+ Steps:
181
+ 1. Look up the task in `trace_manager.task_bindings` to get the
182
+ bound `trace_uuid` and, if available, `root_span_uuid`.
183
+ 2. Resolve the Trace with `_get_trace_by_uuid_anywhere`.
184
+ 3. If a bound root UUID exists, try to find that exact root on the trace.
185
+ 4. Otherwise, fall back to `_pick_root_for_marking(trace)`.
186
+
187
+ Returns a trace / root tuple. Either may be `None` when no binding is
188
+ present. This function is used by `on_task_done` to robustly mark error/cancel
189
+ states without assuming a single root trace or a root that is still open.
190
+ """
191
+ binding = trace_manager.task_bindings.get(t) or {}
192
+ trace_uuid = binding.get("trace_uuid")
193
+ root_span_uuid = binding.get("root_span_uuid")
194
+
195
+ trace = _get_trace_by_uuid_anywhere(trace_uuid) if trace_uuid else None
196
+ root = None
197
+
198
+ if trace and root_span_uuid:
199
+ root = next(
200
+ (rs for rs in trace.root_spans if rs.uuid == root_span_uuid), None
201
+ )
202
+
203
+ if trace and root is None:
204
+ root = _pick_root_for_marking(trace)
205
+
206
+ return trace, root
207
+
208
+
96
209
  async def _snapshot_tasks():
97
210
  cur = asyncio.current_task()
98
211
  # `all_tasks` returns tasks for the current running loop only
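For context: the helper functions introduced in this hunk (`_skip_metrics_for_error`, `_trace_error`, `_pick_root_for_marking`, `_resolve_trace_and_root_for_task`) all reason over the trace's span tree when deciding whether to skip metrics and which root span to annotate. Below is a minimal, self-contained sketch of the "first error wins" depth-first search that `_trace_error` performs, using simplified stand-in types rather than deepeval's real `Trace`/`BaseSpan` classes:

```python
# Sketch only: Span is a hypothetical stand-in, not deepeval's BaseSpan.
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class Span:
    status: str = "SUCCESS"              # "ERRORED" mirrors TraceSpanStatus.ERRORED
    error: Optional[str] = None
    children: List["Span"] = field(default_factory=list)

def first_error(span: Span) -> Optional[str]:
    """Depth-first search that returns the first error message found, if any."""
    if span.status == "ERRORED" and span.error:
        return span.error
    for child in span.children:
        err = first_error(child)
        if err:
            return err
    return None

root = Span(children=[Span(), Span(status="ERRORED", error="tool call failed")])
assert first_error(root) == "tool call failed"
```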
@@ -111,6 +224,20 @@ def _gather_timeout() -> float:
111
224
  )
112
225
 
113
226
 
227
+ def filter_duplicate_results(
228
+ main_result: TestResult, results: List[TestResult]
229
+ ) -> List[TestResult]:
230
+ return [
231
+ result
232
+ for result in results
233
+ if not (
234
+ (result.input == main_result.input)
235
+ and (result.actual_output == main_result.actual_output)
236
+ and (result.metrics_data == main_result.metrics_data)
237
+ )
238
+ ]
239
+
240
+
114
241
  ###########################################
115
242
  ### E2E Evals #############################
116
243
  ###########################################
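The `filter_duplicate_results` helper added above keeps trace-level results out of the final list when they are exact copies of the main test result (same input, actual output, and metrics data). A small usage sketch with a hypothetical `TestResult` stand-in:

```python
# Sketch only: FakeResult is a hypothetical stand-in for deepeval's TestResult.
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class FakeResult:
    input: str
    actual_output: Optional[str]
    metrics_data: Optional[list]

def filter_duplicate_results(main: FakeResult, results: List[FakeResult]) -> List[FakeResult]:
    # Drop results that duplicate the main result on (input, actual_output, metrics_data)
    return [
        r for r in results
        if not (
            r.input == main.input
            and r.actual_output == main.actual_output
            and r.metrics_data == main.metrics_data
        )
    ]

main = FakeResult("hi", "hello", [])
trace_results = [FakeResult("hi", "hello", []), FakeResult("retrieval step", "docs", [])]
assert len(filter_duplicate_results(main, trace_results)) == 1
```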
@@ -376,7 +503,10 @@ async def a_execute_test_cases(
376
503
 
377
504
  async def execute_with_semaphore(func: Callable, *args, **kwargs):
378
505
  async with semaphore:
379
- return await func(*args, **kwargs)
506
+ return await asyncio.wait_for(
507
+ func(*args, **kwargs),
508
+ timeout=_per_task_timeout(),
509
+ )
380
510
 
381
511
  global_test_run_cache_manager.disable_write_cache = (
382
512
  cache_config.write_cache is False
@@ -495,7 +625,20 @@ async def a_execute_test_cases(
495
625
  tasks.append(asyncio.create_task(task))
496
626
 
497
627
  await asyncio.sleep(async_config.throttle_value)
498
- await asyncio.gather(*tasks)
628
+
629
+ try:
630
+ await asyncio.wait_for(
631
+ asyncio.gather(*tasks),
632
+ timeout=_gather_timeout(),
633
+ )
634
+ except asyncio.TimeoutError:
635
+ # Cancel any still-pending tasks and drain them
636
+ for t in tasks:
637
+ if not t.done():
638
+ t.cancel()
639
+ await asyncio.gather(*tasks, return_exceptions=True)
640
+ raise
641
+
499
642
  else:
500
643
  for test_case in test_cases:
501
644
  with capture_evaluation_run("test case"):
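This hunk and the previous one follow the same concurrency pattern: `execute_with_semaphore` now wraps each coroutine in `asyncio.wait_for` for a per-task deadline, and the surrounding `asyncio.gather` gets a global deadline with cancel-and-drain on expiry. A standalone sketch of that pattern, with illustrative constants standing in for `_per_task_timeout()` and `_gather_timeout()`:

```python
# Standalone sketch, not deepeval's code: timeout values are illustrative only.
import asyncio

PER_TASK_TIMEOUT = 30.0
GATHER_TIMEOUT = 120.0

async def run_all(coros, max_concurrent: int = 5):
    semaphore = asyncio.Semaphore(max_concurrent)

    async def bounded(coro):
        # Per-task deadline, enforced inside the concurrency limit
        async with semaphore:
            return await asyncio.wait_for(coro, timeout=PER_TASK_TIMEOUT)

    tasks = [asyncio.create_task(bounded(c)) for c in coros]
    try:
        # Global deadline for the whole batch
        return await asyncio.wait_for(asyncio.gather(*tasks), timeout=GATHER_TIMEOUT)
    except asyncio.TimeoutError:
        # Cancel whatever is still pending and drain it so nothing leaks
        for t in tasks:
            if not t.done():
                t.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
        raise

async def main():
    async def work(i: int) -> int:
        await asyncio.sleep(0.01)
        return i

    print(await run_all([work(i) for i in range(10)]))

asyncio.run(main())
```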
@@ -568,7 +711,19 @@ async def a_execute_test_cases(
568
711
  tasks.append(asyncio.create_task(task))
569
712
 
570
713
  await asyncio.sleep(async_config.throttle_value)
571
- await asyncio.gather(*tasks)
714
+
715
+ try:
716
+ await asyncio.wait_for(
717
+ asyncio.gather(*tasks),
718
+ timeout=_gather_timeout(),
719
+ )
720
+ except asyncio.TimeoutError:
721
+ # Cancel any still-pending tasks and drain them
722
+ for t in tasks:
723
+ if not t.done():
724
+ t.cancel()
725
+ await asyncio.gather(*tasks, return_exceptions=True)
726
+ raise
572
727
 
573
728
  return test_results
574
729
 
@@ -843,6 +998,7 @@ def execute_agentic_test_cases(
843
998
  _progress=progress,
844
999
  _pbar_callback_id=pbar_tags_id,
845
1000
  ):
1001
+
846
1002
  if asyncio.iscoroutinefunction(observed_callback):
847
1003
  loop = get_or_create_event_loop()
848
1004
  coro = observed_callback(golden.input)
@@ -894,14 +1050,16 @@ def execute_agentic_test_cases(
894
1050
  pbar_eval_id: Optional[int] = None,
895
1051
  ):
896
1052
  # Create API Span
897
- metrics: List[BaseMetric] = span.metrics
1053
+ metrics: List[BaseMetric] = list(span.metrics or [])
898
1054
  api_span: BaseApiSpan = (
899
1055
  trace_manager._convert_span_to_api_span(span)
900
1056
  )
1057
+
901
1058
  if isinstance(span, AgentSpan):
902
1059
  trace_api.agent_spans.append(api_span)
903
1060
  elif isinstance(span, LlmSpan):
904
1061
  trace_api.llm_spans.append(api_span)
1062
+ log_prompt(span, test_run_manager)
905
1063
  elif isinstance(span, RetrieverSpan):
906
1064
  trace_api.retriever_spans.append(api_span)
907
1065
  elif isinstance(span, ToolSpan):
@@ -909,14 +1067,27 @@ def execute_agentic_test_cases(
909
1067
  else:
910
1068
  trace_api.base_spans.append(api_span)
911
1069
 
1070
+ # Skip errored trace/span
1071
+ if _skip_metrics_for_error(span=span, trace=current_trace):
1072
+ api_span.status = TraceSpanApiStatus.ERRORED
1073
+ api_span.error = span.error or _trace_error(
1074
+ current_trace
1075
+ )
1076
+ if progress and pbar_eval_id is not None:
1077
+ update_pbar(
1078
+ progress,
1079
+ pbar_eval_id,
1080
+ advance=count_metrics_in_span_subtree(span),
1081
+ )
1082
+ return
1083
+
912
1084
  for child in span.children:
913
1085
  dfs(child, progress, pbar_eval_id)
914
1086
 
915
- if span.metrics is None:
1087
+ if not span.metrics:
916
1088
  return
917
- has_task_completion = any(
918
- isinstance(metric, TaskCompletionMetric)
919
- for metric in span.metrics
1089
+ requires_trace = any(
1090
+ metric.requires_trace for metric in span.metrics
920
1091
  )
921
1092
 
922
1093
  llm_test_case = None
@@ -934,18 +1105,30 @@ def execute_agentic_test_cases(
934
1105
  tools_called=span.tools_called,
935
1106
  expected_tools=span.expected_tools,
936
1107
  )
937
- if llm_test_case is None and not has_task_completion:
938
- raise ValueError(
939
- "Unable to run metrics on span without LLMTestCase. Are you sure you called `update_current_span()`?"
940
- )
941
1108
 
942
1109
  # add trace if task completion
943
- if has_task_completion:
1110
+ if requires_trace:
944
1111
  if llm_test_case is None:
945
1112
  llm_test_case = LLMTestCase(input="None")
946
1113
  llm_test_case._trace_dict = (
947
1114
  trace_manager.create_nested_spans_dict(span)
948
1115
  )
1116
+ else:
1117
+ if llm_test_case is None:
1118
+ api_span.status = TraceSpanApiStatus.ERRORED
1119
+ api_span.error = format_error_text(
1120
+ DeepEvalError(
1121
+ "Span has metrics but no LLMTestCase. "
1122
+ "Are you sure you called `update_current_span()`?"
1123
+ )
1124
+ )
1125
+ if progress and pbar_eval_id is not None:
1126
+ update_pbar(
1127
+ progress,
1128
+ pbar_eval_id,
1129
+ advance=count_metrics_in_span_subtree(span),
1130
+ )
1131
+ return
949
1132
 
950
1133
  # Preparing metric calculation
951
1134
  api_span.metrics_data = []
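The changes above replace the `isinstance(metric, TaskCompletionMetric)` check with the `metric.requires_trace` capability flag, so any metric can declare that it needs the full trace rather than a plain `LLMTestCase`. A compact sketch of that dispatch with hypothetical metric classes:

```python
# Hypothetical metric classes; only the requires_trace flag mirrors the diff above.
class BaseMetric:
    requires_trace: bool = False

class AnswerRelevancy(BaseMetric):
    pass

class TaskCompletion(BaseMetric):
    requires_trace = True   # needs the full trace, not just an LLMTestCase

def needs_trace(metrics) -> bool:
    return any(m.requires_trace for m in metrics)

assert needs_trace([AnswerRelevancy(), TaskCompletion()]) is True
assert needs_trace([AnswerRelevancy()]) is False
```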
@@ -984,72 +1167,111 @@ def execute_agentic_test_cases(
984
1167
 
985
1168
  start_time = time.perf_counter()
986
1169
 
1170
+ skip_metrics_for_this_golden = False
987
1171
  # Handle trace-level metrics
988
- if current_trace.metrics:
989
- has_task_completion = any(
990
- isinstance(metric, TaskCompletionMetric)
991
- for metric in current_trace.metrics
992
- )
993
-
994
- llm_test_case = None
995
- if current_trace.input:
996
- llm_test_case = LLMTestCase(
997
- input=str(current_trace.input),
998
- actual_output=(
999
- str(current_trace.output)
1000
- if current_trace.output is not None
1001
- else None
1172
+ if _skip_metrics_for_error(trace=current_trace):
1173
+ trace_api.status = TraceSpanApiStatus.ERRORED
1174
+ if progress and pbar_eval_id is not None:
1175
+ update_pbar(
1176
+ progress,
1177
+ pbar_eval_id,
1178
+ advance=count_total_metrics_for_trace(
1179
+ current_trace
1002
1180
  ),
1003
- expected_output=current_trace.expected_output,
1004
- context=current_trace.context,
1005
- retrieval_context=current_trace.retrieval_context,
1006
- tools_called=current_trace.tools_called,
1007
- expected_tools=current_trace.expected_tools,
1008
1181
  )
1009
- if llm_test_case is None and not has_task_completion:
1010
- raise ValueError(
1011
- "Unable to run metrics on trace without LLMTestCase. Are you sure you called `update_current_trace()`?"
1182
+ else:
1183
+ if current_trace.metrics:
1184
+ requires_trace = any(
1185
+ metric.requires_trace
1186
+ for metric in current_trace.metrics
1012
1187
  )
1013
1188
 
1014
- if has_task_completion:
1015
- if llm_test_case is None:
1016
- llm_test_case = LLMTestCase(input="None")
1017
- llm_test_case._trace_dict = (
1018
- trace_manager.create_nested_spans_dict(
1019
- current_trace.root_spans[0]
1189
+ llm_test_case = None
1190
+ if current_trace.input:
1191
+ llm_test_case = LLMTestCase(
1192
+ input=str(current_trace.input),
1193
+ actual_output=(
1194
+ str(current_trace.output)
1195
+ if current_trace.output is not None
1196
+ else None
1197
+ ),
1198
+ expected_output=current_trace.expected_output,
1199
+ context=current_trace.context,
1200
+ retrieval_context=current_trace.retrieval_context,
1201
+ tools_called=current_trace.tools_called,
1202
+ expected_tools=current_trace.expected_tools,
1020
1203
  )
1021
- )
1022
-
1023
- for metric in current_trace.metrics:
1024
- metric.skipped = False
1025
- metric.error = None
1026
- if display_config.verbose_mode is not None:
1027
- metric.verbose_mode = display_config.verbose_mode
1028
-
1029
- trace_api.metrics_data = []
1030
- for metric in current_trace.metrics:
1031
- res = _execute_metric(
1032
- metric=metric,
1033
- test_case=llm_test_case,
1034
- show_metric_indicator=show_metric_indicator,
1035
- in_component=True,
1036
- error_config=error_config,
1037
- )
1038
- if res == "skip":
1039
- continue
1040
-
1041
- if not metric.skipped:
1042
- metric_data = create_metric_data(metric)
1043
- trace_api.metrics_data.append(metric_data)
1044
- api_test_case.update_metric_data(metric_data)
1045
- api_test_case.update_status(metric_data.success)
1046
- update_pbar(progress, pbar_eval_id)
1204
+ if requires_trace:
1205
+ if llm_test_case is None:
1206
+ llm_test_case = LLMTestCase(input="None")
1207
+ llm_test_case._trace_dict = (
1208
+ trace_manager.create_nested_spans_dict(
1209
+ current_trace.root_spans[0]
1210
+ )
1211
+ )
1212
+ else:
1213
+ if llm_test_case is None:
1214
+ current_trace.status = TraceSpanStatus.ERRORED
1215
+ trace_api.status = TraceSpanApiStatus.ERRORED
1216
+ if current_trace.root_spans:
1217
+ current_trace.root_spans[0].status = (
1218
+ TraceSpanStatus.ERRORED
1219
+ )
1220
+ current_trace.root_spans[0].error = (
1221
+ format_error_text(
1222
+ DeepEvalError(
1223
+ "Trace has metrics but no LLMTestCase (missing input/output). "
1224
+ "Are you sure you called `update_current_trace()`?"
1225
+ )
1226
+ )
1227
+ )
1228
+ if progress and pbar_eval_id is not None:
1229
+ update_pbar(
1230
+ progress,
1231
+ pbar_eval_id,
1232
+ advance=count_total_metrics_for_trace(
1233
+ current_trace
1234
+ ),
1235
+ )
1236
+ skip_metrics_for_this_golden = True
1237
+
1238
+ if not skip_metrics_for_this_golden:
1239
+ for metric in current_trace.metrics:
1240
+ metric.skipped = False
1241
+ metric.error = None
1242
+ if display_config.verbose_mode is not None:
1243
+ metric.verbose_mode = (
1244
+ display_config.verbose_mode
1245
+ )
1246
+
1247
+ trace_api.metrics_data = []
1248
+ for metric in current_trace.metrics:
1249
+ res = _execute_metric(
1250
+ metric=metric,
1251
+ test_case=llm_test_case,
1252
+ show_metric_indicator=show_metric_indicator,
1253
+ in_component=True,
1254
+ error_config=error_config,
1255
+ )
1256
+ if res == "skip":
1257
+ continue
1258
+
1259
+ if not metric.skipped:
1260
+ metric_data = create_metric_data(metric)
1261
+ trace_api.metrics_data.append(metric_data)
1262
+ api_test_case.update_metric_data(
1263
+ metric_data
1264
+ )
1265
+ api_test_case.update_status(
1266
+ metric_data.success
1267
+ )
1268
+ update_pbar(progress, pbar_eval_id)
1269
+
1270
+ # Then handle span-level metrics
1271
+ dfs(current_trace.root_spans[0], progress, pbar_eval_id)
1047
1272
 
1048
- # Then handle span-level metrics
1049
- dfs(current_trace.root_spans[0], progress, pbar_eval_id)
1050
1273
  end_time = time.perf_counter()
1051
1274
  run_duration = end_time - start_time
1052
-
1053
1275
  # Update test run
1054
1276
  api_test_case.update_run_duration(run_duration)
1055
1277
  test_run_manager.update_test_run(api_test_case, test_case)
@@ -1097,7 +1319,10 @@ async def a_execute_agentic_test_cases(
1097
1319
 
1098
1320
  async def execute_with_semaphore(func: Callable, *args, **kwargs):
1099
1321
  async with semaphore:
1100
- return await func(*args, **kwargs)
1322
+ return await asyncio.wait_for(
1323
+ func(*args, **kwargs),
1324
+ timeout=_per_task_timeout(),
1325
+ )
1101
1326
 
1102
1327
  test_run_manager = global_test_run_manager
1103
1328
  test_run_manager.save_to_disk = cache_config.write_cache
@@ -1144,7 +1369,19 @@ async def a_execute_agentic_test_cases(
1144
1369
  tasks.append(asyncio.create_task(task))
1145
1370
  await asyncio.sleep(async_config.throttle_value)
1146
1371
 
1147
- await asyncio.gather(*tasks)
1372
+ try:
1373
+ await asyncio.wait_for(
1374
+ asyncio.gather(*tasks),
1375
+ timeout=_gather_timeout(),
1376
+ )
1377
+ except asyncio.TimeoutError:
1378
+ # Cancel any still-pending tasks and drain them
1379
+ for t in tasks:
1380
+ if not t.done():
1381
+ t.cancel()
1382
+ await asyncio.gather(*tasks, return_exceptions=True)
1383
+ raise
1384
+
1148
1385
  else:
1149
1386
  for golden in goldens:
1150
1387
  with capture_evaluation_run("golden"):
@@ -1261,7 +1498,7 @@ async def _a_execute_agentic_test_case(
1261
1498
  )
1262
1499
 
1263
1500
  await _a_execute_trace_test_case(
1264
- trace=trace,
1501
+ trace=current_trace,
1265
1502
  trace_api=trace_api,
1266
1503
  api_test_case=api_test_case,
1267
1504
  ignore_errors=ignore_errors,
@@ -1273,9 +1510,10 @@ async def _a_execute_agentic_test_case(
1273
1510
  _use_bar_indicator=_use_bar_indicator,
1274
1511
  )
1275
1512
 
1276
- async def dfs(span: BaseSpan):
1513
+ async def dfs(trace: Trace, span: BaseSpan):
1277
1514
  await _a_execute_span_test_case(
1278
1515
  span=span,
1516
+ current_trace=trace,
1279
1517
  trace_api=trace_api,
1280
1518
  api_test_case=api_test_case,
1281
1519
  ignore_errors=ignore_errors,
@@ -1284,27 +1522,61 @@ async def _a_execute_agentic_test_case(
1284
1522
  verbose_mode=verbose_mode,
1285
1523
  progress=progress,
1286
1524
  pbar_eval_id=pbar_eval_id,
1525
+ test_run_manager=test_run_manager,
1287
1526
  _use_bar_indicator=_use_bar_indicator,
1288
1527
  )
1289
- child_tasks = [dfs(child) for child in span.children]
1528
+
1529
+ if _skip_metrics_for_error(span=span, trace=trace):
1530
+ return
1531
+
1532
+ child_tasks = [
1533
+ asyncio.create_task(dfs(trace, child)) for child in span.children
1534
+ ]
1290
1535
  if child_tasks:
1291
- await asyncio.gather(*child_tasks)
1536
+ try:
1537
+ await asyncio.wait_for(
1538
+ asyncio.gather(*child_tasks),
1539
+ timeout=_gather_timeout(),
1540
+ )
1541
+ except asyncio.TimeoutError:
1542
+ for t in child_tasks:
1543
+ if not t.done():
1544
+ t.cancel()
1545
+ await asyncio.gather(*child_tasks, return_exceptions=True)
1546
+ raise
1292
1547
 
1293
1548
  test_start_time = time.perf_counter()
1294
- await dfs(current_trace.root_spans[0])
1549
+
1550
+ if not _skip_metrics_for_error(trace=current_trace):
1551
+ if current_trace and current_trace.root_spans:
1552
+ await dfs(current_trace, current_trace.root_spans[0])
1553
+ else:
1554
+ if (
1555
+ logger.isEnabledFor(logging.DEBUG)
1556
+ and get_settings().DEEPEVAL_VERBOSE_MODE
1557
+ ):
1558
+ logger.debug(
1559
+ "Skipping DFS: empty trace or no root spans (trace=%s)",
1560
+ current_trace.uuid if current_trace else None,
1561
+ )
1562
+
1295
1563
  test_end_time = time.perf_counter()
1296
1564
  run_duration = test_end_time - test_start_time
1297
1565
 
1298
1566
  api_test_case.update_run_duration(run_duration)
1299
1567
  test_run_manager.update_test_run(api_test_case, test_case)
1300
- test_results.append(create_test_result(api_test_case))
1301
- test_results.extend(extract_trace_test_results(trace_api))
1568
+ main_result = create_test_result(api_test_case)
1569
+ trace_results = extract_trace_test_results(trace_api)
1570
+ unique_trace_results = filter_duplicate_results(main_result, trace_results)
1571
+ test_results.append(main_result)
1572
+ test_results.extend(unique_trace_results)
1302
1573
 
1303
1574
  update_pbar(progress, pbar_id)
1304
1575
 
1305
1576
 
1306
1577
  async def _a_execute_span_test_case(
1307
1578
  span: BaseSpan,
1579
+ current_trace: Trace,
1308
1580
  trace_api: TraceApi,
1309
1581
  api_test_case: LLMApiTestCase,
1310
1582
  ignore_errors: bool,
@@ -1313,6 +1585,7 @@ async def _a_execute_span_test_case(
1313
1585
  verbose_mode: Optional[bool],
1314
1586
  progress: Optional[Progress],
1315
1587
  pbar_eval_id: Optional[int],
1588
+ test_run_manager: Optional[TestRunManager],
1316
1589
  _use_bar_indicator: bool,
1317
1590
  ):
1318
1591
  api_span: BaseApiSpan = trace_manager._convert_span_to_api_span(span)
@@ -1320,6 +1593,7 @@ async def _a_execute_span_test_case(
1320
1593
  trace_api.agent_spans.append(api_span)
1321
1594
  elif isinstance(span, LlmSpan):
1322
1595
  trace_api.llm_spans.append(api_span)
1596
+ log_prompt(span, test_run_manager)
1323
1597
  elif isinstance(span, RetrieverSpan):
1324
1598
  trace_api.retriever_spans.append(api_span)
1325
1599
  elif isinstance(span, ToolSpan):
@@ -1327,12 +1601,22 @@ async def _a_execute_span_test_case(
1327
1601
  else:
1328
1602
  trace_api.base_spans.append(api_span)
1329
1603
 
1330
- if span.metrics is None:
1604
+ if _skip_metrics_for_error(span=span, trace=current_trace):
1605
+ api_span.status = TraceSpanApiStatus.ERRORED
1606
+ api_span.error = span.error or _trace_error(current_trace)
1607
+ if progress and pbar_eval_id is not None:
1608
+ update_pbar(
1609
+ progress,
1610
+ pbar_eval_id,
1611
+ advance=count_metrics_in_span_subtree(span),
1612
+ )
1331
1613
  return
1332
1614
 
1333
- has_task_completion = any(
1334
- isinstance(metric, TaskCompletionMetric) for metric in span.metrics
1335
- )
1615
+ metrics: List[BaseMetric] = list(span.metrics or [])
1616
+ if not metrics:
1617
+ return
1618
+
1619
+ requires_trace = any(metric.requires_trace for metric in metrics)
1336
1620
 
1337
1621
  llm_test_case = None
1338
1622
  if span.input:
@@ -1345,17 +1629,29 @@ async def _a_execute_span_test_case(
1345
1629
  tools_called=span.tools_called,
1346
1630
  expected_tools=span.expected_tools,
1347
1631
  )
1348
- if llm_test_case is None and not has_task_completion:
1349
- raise ValueError(
1350
- "Unable to run metrics on span without LLMTestCase. Are you sure you called `update_current_span()`?"
1351
- )
1632
+
1633
+ if not requires_trace:
1634
+ if llm_test_case is None:
1635
+ api_span.status = TraceSpanApiStatus.ERRORED
1636
+ api_span.error = format_error_text(
1637
+ DeepEvalError(
1638
+ "Span has metrics but no LLMTestCase. "
1639
+ "Are you sure you called `update_current_span()`?"
1640
+ )
1641
+ )
1642
+ if progress and pbar_eval_id is not None:
1643
+ update_pbar(
1644
+ progress,
1645
+ pbar_eval_id,
1646
+ advance=count_metrics_in_span_subtree(span),
1647
+ )
1648
+ return
1352
1649
 
1353
1650
  show_metrics_indicator = show_indicator and not _use_bar_indicator
1354
- metrics: List[BaseMetric] = span.metrics
1355
1651
  test_case: Optional[LLMTestCase] = llm_test_case
1356
1652
 
1357
1653
  # add trace if task completion
1358
- if has_task_completion:
1654
+ if requires_trace:
1359
1655
  if test_case is None:
1360
1656
  test_case = LLMTestCase(input="None")
1361
1657
  test_case._trace_dict = trace_manager.create_nested_spans_dict(span)
@@ -1399,12 +1695,22 @@ async def _a_execute_trace_test_case(
1399
1695
  pbar_eval_id: Optional[int],
1400
1696
  _use_bar_indicator: bool,
1401
1697
  ):
1402
- if trace.metrics is None:
1698
+
1699
+ if _skip_metrics_for_error(trace=trace):
1700
+ trace_api.status = TraceSpanApiStatus.ERRORED
1701
+ if progress and pbar_eval_id is not None:
1702
+ update_pbar(
1703
+ progress,
1704
+ pbar_eval_id,
1705
+ advance=count_total_metrics_for_trace(trace),
1706
+ )
1403
1707
  return
1404
1708
 
1405
- has_task_completion = any(
1406
- isinstance(metric, TaskCompletionMetric) for metric in trace.metrics
1407
- )
1709
+ metrics: List[BaseMetric] = list(trace.metrics or [])
1710
+ if not metrics:
1711
+ return
1712
+
1713
+ requires_trace = any(metric.requires_trace for metric in metrics)
1408
1714
 
1409
1715
  llm_test_case = None
1410
1716
  if trace.input:
@@ -1419,17 +1725,32 @@ async def _a_execute_trace_test_case(
1419
1725
  tools_called=trace.tools_called,
1420
1726
  expected_tools=trace.expected_tools,
1421
1727
  )
1422
- if llm_test_case is None and not has_task_completion:
1423
- raise ValueError(
1424
- "Unable to run metrics on trace without LLMTestCase. Are you sure you called `update_current_trace()`?"
1425
- )
1728
+
1729
+ if not requires_trace:
1730
+ if llm_test_case is None:
1731
+ trace.status = TraceSpanStatus.ERRORED
1732
+ trace_api.status = TraceSpanApiStatus.ERRORED
1733
+ if trace.root_spans:
1734
+ trace.root_spans[0].status = TraceSpanStatus.ERRORED
1735
+ trace.root_spans[0].error = format_error_text(
1736
+ DeepEvalError(
1737
+ "Trace has metrics but no LLMTestCase (missing input/output). "
1738
+ "Are you sure you called `update_current_trace()`?"
1739
+ )
1740
+ )
1741
+ if progress and pbar_eval_id is not None:
1742
+ update_pbar(
1743
+ progress,
1744
+ pbar_eval_id,
1745
+ advance=count_total_metrics_for_trace(trace),
1746
+ )
1747
+ return
1426
1748
 
1427
1749
  show_metrics_indicator = show_indicator and not _use_bar_indicator
1428
- metrics: List[BaseMetric] = trace.metrics
1429
1750
  test_case: Optional[LLMTestCase] = llm_test_case
1430
1751
 
1431
1752
  # add trace if task completion
1432
- if has_task_completion:
1753
+ if requires_trace:
1433
1754
  if test_case is None:
1434
1755
  test_case = LLMTestCase(input="None")
1435
1756
  test_case._trace_dict = trace_manager.create_nested_spans_dict(
@@ -1559,15 +1880,17 @@ def execute_agentic_test_cases_from_loop(
1559
1880
  pbar_eval_id: Optional[int] = None,
1560
1881
  ):
1561
1882
  # Create API Span
1562
- metrics: List[BaseMetric] = span.metrics
1883
+ metrics: List[BaseMetric] = list(span.metrics or [])
1563
1884
 
1564
1885
  api_span: BaseApiSpan = (
1565
1886
  trace_manager._convert_span_to_api_span(span)
1566
1887
  )
1888
+
1567
1889
  if isinstance(span, AgentSpan):
1568
1890
  trace_api.agent_spans.append(api_span)
1569
1891
  elif isinstance(span, LlmSpan):
1570
1892
  trace_api.llm_spans.append(api_span)
1893
+ log_prompt(span, test_run_manager)
1571
1894
  elif isinstance(span, RetrieverSpan):
1572
1895
  trace_api.retriever_spans.append(api_span)
1573
1896
  elif isinstance(span, ToolSpan):
@@ -1575,9 +1898,30 @@ def execute_agentic_test_cases_from_loop(
1575
1898
  else:
1576
1899
  trace_api.base_spans.append(api_span)
1577
1900
 
1901
+ # Skip errored trace/span
1902
+ if _skip_metrics_for_error(span=span, trace=current_trace):
1903
+ api_span.status = TraceSpanApiStatus.ERRORED
1904
+ api_span.error = span.error or _trace_error(
1905
+ current_trace
1906
+ )
1907
+ if progress and pbar_eval_id is not None:
1908
+ update_pbar(
1909
+ progress,
1910
+ pbar_eval_id,
1911
+ advance=count_metrics_in_span_subtree(span),
1912
+ )
1913
+ return
1914
+
1578
1915
  for child in span.children:
1579
1916
  dfs(child, progress, pbar_eval_id)
1580
1917
 
1918
+ if not span.metrics:
1919
+ return
1920
+
1921
+ requires_trace = any(
1922
+ metric.requires_trace for metric in metrics
1923
+ )
1924
+
1581
1925
  llm_test_case = None
1582
1926
  if span.input is not None:
1583
1927
  llm_test_case = LLMTestCase(
@@ -1593,20 +1937,29 @@ def execute_agentic_test_cases_from_loop(
1593
1937
  tools_called=span.tools_called,
1594
1938
  expected_tools=span.expected_tools,
1595
1939
  )
1596
- if span.metrics is None or llm_test_case is None:
1597
- return
1598
1940
 
1599
- has_task_completion = any(
1600
- isinstance(metric, TaskCompletionMetric)
1601
- for metric in metrics
1602
- )
1603
-
1604
- if has_task_completion:
1941
+ if requires_trace:
1605
1942
  if llm_test_case is None:
1606
1943
  llm_test_case = LLMTestCase(input="None")
1607
1944
  llm_test_case._trace_dict = (
1608
1945
  trace_manager.create_nested_spans_dict(span)
1609
1946
  )
1947
+ else:
1948
+ if llm_test_case is None:
1949
+ api_span.status = TraceSpanApiStatus.ERRORED
1950
+ api_span.error = format_error_text(
1951
+ DeepEvalError(
1952
+ "Span has metrics but no LLMTestCase. "
1953
+ "Are you sure you called `update_current_span()`?"
1954
+ )
1955
+ )
1956
+ if progress and pbar_eval_id is not None:
1957
+ update_pbar(
1958
+ progress,
1959
+ pbar_eval_id,
1960
+ advance=count_metrics_in_span_subtree(span),
1961
+ )
1962
+ return
1610
1963
 
1611
1964
  # Preparing metric calculation
1612
1965
  api_span.metrics_data = []
@@ -1650,77 +2003,123 @@ def execute_agentic_test_cases_from_loop(
1650
2003
  start_time = time.perf_counter()
1651
2004
 
1652
2005
  # Handle trace-level metrics
1653
- if current_trace.metrics:
1654
- has_task_completion = any(
1655
- isinstance(metric, TaskCompletionMetric)
1656
- for metric in current_trace.metrics
1657
- )
1658
-
1659
- llm_test_case = None
1660
- if current_trace.input:
1661
- llm_test_case = LLMTestCase(
1662
- input=str(current_trace.input),
1663
- actual_output=(
1664
- str(current_trace.output)
1665
- if current_trace.output is not None
1666
- else None
2006
+ skip_metrics_for_this_golden = False
2007
+ if _skip_metrics_for_error(trace=current_trace):
2008
+ trace_api.status = TraceSpanApiStatus.ERRORED
2009
+ if progress and pbar_eval_id is not None:
2010
+ update_pbar(
2011
+ progress,
2012
+ pbar_eval_id,
2013
+ advance=count_total_metrics_for_trace(
2014
+ current_trace
1667
2015
  ),
1668
- expected_output=current_trace.expected_output,
1669
- context=current_trace.context,
1670
- retrieval_context=current_trace.retrieval_context,
1671
- tools_called=current_trace.tools_called,
1672
- expected_tools=current_trace.expected_tools,
1673
2016
  )
1674
- if llm_test_case is None and not has_task_completion:
1675
- raise ValueError(
1676
- "Unable to run metrics on trace without LLMTestCase. Are you sure you called `update_current_trace()`?"
2017
+ else:
2018
+ if current_trace.metrics:
2019
+ requires_trace = any(
2020
+ metric.requires_trace
2021
+ for metric in current_trace.metrics
1677
2022
  )
1678
2023
 
1679
- if has_task_completion:
1680
- if llm_test_case is None:
1681
- llm_test_case = LLMTestCase(input="None")
1682
- llm_test_case._trace_dict = (
1683
- trace_manager.create_nested_spans_dict(
1684
- current_trace.root_spans[0]
2024
+ llm_test_case = None
2025
+ if current_trace.input:
2026
+ llm_test_case = LLMTestCase(
2027
+ input=str(current_trace.input),
2028
+ actual_output=(
2029
+ str(current_trace.output)
2030
+ if current_trace.output is not None
2031
+ else None
2032
+ ),
2033
+ expected_output=current_trace.expected_output,
2034
+ context=current_trace.context,
2035
+ retrieval_context=current_trace.retrieval_context,
2036
+ tools_called=current_trace.tools_called,
2037
+ expected_tools=current_trace.expected_tools,
1685
2038
  )
1686
- )
1687
-
1688
- for metric in current_trace.metrics:
1689
- metric.skipped = False
1690
- metric.error = None
1691
- if display_config.verbose_mode is not None:
1692
- metric.verbose_mode = display_config.verbose_mode
1693
-
1694
- trace_api.metrics_data = []
1695
- for metric in current_trace.metrics:
1696
- res = _execute_metric(
1697
- metric=metric,
1698
- test_case=llm_test_case,
1699
- show_metric_indicator=show_metric_indicator,
1700
- in_component=True,
1701
- error_config=error_config,
1702
- )
1703
- if res == "skip":
1704
- continue
1705
-
1706
- if not metric.skipped:
1707
- metric_data = create_metric_data(metric)
1708
- trace_api.metrics_data.append(metric_data)
1709
- api_test_case.update_metric_data(metric_data)
1710
- api_test_case.update_status(metric_data.success)
1711
- update_pbar(progress, pbar_eval_id)
1712
2039
 
1713
- # Then handle span-level metrics
1714
- dfs(current_trace.root_spans[0], progress, pbar_eval_id)
1715
- end_time = time.perf_counter()
1716
- run_duration = end_time - start_time
1717
-
1718
- # Update test run
1719
- api_test_case.update_run_duration(run_duration)
1720
- test_run_manager.update_test_run(api_test_case, test_case)
1721
- test_results.append(create_test_result(api_test_case))
2040
+ if requires_trace:
2041
+ if llm_test_case is None:
2042
+ llm_test_case = LLMTestCase(input="None")
2043
+ llm_test_case._trace_dict = (
2044
+ trace_manager.create_nested_spans_dict(
2045
+ current_trace.root_spans[0]
2046
+ )
2047
+ )
2048
+ else:
2049
+ if llm_test_case is None:
2050
+ current_trace.status = TraceSpanStatus.ERRORED
2051
+ trace_api.status = TraceSpanApiStatus.ERRORED
2052
+ if current_trace.root_spans:
2053
+ current_trace.root_spans[0].status = (
2054
+ TraceSpanStatus.ERRORED
2055
+ )
2056
+ current_trace.root_spans[0].error = (
2057
+ format_error_text(
2058
+ DeepEvalError(
2059
+ "Trace has metrics but no LLMTestCase (missing input/output). "
2060
+ "Are you sure you called `update_current_trace()`?"
2061
+ )
2062
+ )
2063
+ )
2064
+ if progress and pbar_eval_id is not None:
2065
+ update_pbar(
2066
+ progress,
2067
+ pbar_eval_id,
2068
+ advance=count_total_metrics_for_trace(
2069
+ current_trace
2070
+ ),
2071
+ )
2072
+ skip_metrics_for_this_golden = True
2073
+
2074
+ if not skip_metrics_for_this_golden:
2075
+ for metric in current_trace.metrics:
2076
+ metric.skipped = False
2077
+ metric.error = None
2078
+ if display_config.verbose_mode is not None:
2079
+ metric.verbose_mode = (
2080
+ display_config.verbose_mode
2081
+ )
2082
+
2083
+ trace_api.metrics_data = []
2084
+ for metric in current_trace.metrics:
2085
+ res = _execute_metric(
2086
+ metric=metric,
2087
+ test_case=llm_test_case,
2088
+ show_metric_indicator=show_metric_indicator,
2089
+ in_component=True,
2090
+ error_config=error_config,
2091
+ )
2092
+ if res == "skip":
2093
+ continue
2094
+
2095
+ if not metric.skipped:
2096
+ metric_data = create_metric_data(metric)
2097
+ trace_api.metrics_data.append(metric_data)
2098
+ api_test_case.update_metric_data(
2099
+ metric_data
2100
+ )
2101
+ api_test_case.update_status(
2102
+ metric_data.success
2103
+ )
2104
+ update_pbar(progress, pbar_eval_id)
2105
+
2106
+ # Then handle span-level metrics
2107
+ dfs(current_trace.root_spans[0], progress, pbar_eval_id)
2108
+
2109
+ end_time = time.perf_counter()
2110
+ run_duration = end_time - start_time
2111
+ # Update test run
2112
+ api_test_case.update_run_duration(run_duration)
2113
+ test_run_manager.update_test_run(api_test_case, test_case)
2114
+ main_result = create_test_result(api_test_case)
2115
+ trace_results = extract_trace_test_results(trace_api)
2116
+ unique_trace_results = filter_duplicate_results(
2117
+ main_result, trace_results
2118
+ )
2119
+ test_results.append(main_result)
2120
+ test_results.extend(unique_trace_results)
1722
2121
 
1723
- update_pbar(progress, pbar_id)
2122
+ update_pbar(progress, pbar_id)
1724
2123
 
1725
2124
  try:
1726
2125
  if display_config.show_indicator and _use_bar_indicator:
@@ -1748,6 +2147,7 @@ def execute_agentic_test_cases_from_loop(
1748
2147
  local_trace_manager.evaluating = False
1749
2148
  local_trace_manager.traces_to_evaluate_order.clear()
1750
2149
  local_trace_manager.traces_to_evaluate.clear()
2150
+ local_trace_manager.trace_uuid_to_golden.clear()
1751
2151
 
1752
2152
 
1753
2153
  def a_execute_agentic_test_cases_from_loop(
@@ -1820,39 +2220,137 @@ def a_execute_agentic_test_cases_from_loop(
1820
2220
  }
1821
2221
 
1822
2222
  def on_task_done(t: asyncio.Task):
2223
+ cancelled = False
2224
+ exc = None
2225
+ trace = None
2226
+ root = None
2227
+ resolved_trace_from_task = False
2228
+ resolved_root_from_task = False
2229
+
2230
+ # Task.exception() raises CancelledError if task was cancelled
2231
+ try:
2232
+ exc = t.exception()
2233
+ except asyncio.CancelledError:
2234
+ cancelled = True
2235
+ exc = None
2236
+
2237
+ meta = task_meta.get(t, {})
2238
+ golden_index = meta.get("golden_index")
2239
+
2240
+ if golden_index is not None and 0 <= golden_index < len(
2241
+ goldens
2242
+ ):
2243
+ golden = goldens[golden_index]
2244
+
2245
+ def _mark_trace_error(trace, root, msg: str):
2246
+ now = time.perf_counter()
2247
+ trace.status = TraceSpanStatus.ERRORED
2248
+ # Close the trace so the API layer has a proper endTime
2249
+ if trace.end_time is None:
2250
+ trace.end_time = now
2251
+ if root:
2252
+ root.status = TraceSpanStatus.ERRORED
2253
+ root.error = msg
2254
+ if root.end_time is None:
2255
+ root.end_time = now
2256
+
2257
+ if exc is not None:
2258
+ msg = format_error_text(exc)
2259
+ trace, root = _resolve_trace_and_root_for_task(t)
2260
+ resolved_trace_from_task = bool(trace)
2261
+ resolved_root_from_task = bool(root)
2262
+ if trace:
2263
+ _mark_trace_error(trace, root, msg)
2264
+ else:
2265
+ for (
2266
+ trace
2267
+ ) in trace_manager.integration_traces_to_evaluate:
2268
+ if (
2269
+ trace_manager.trace_uuid_to_golden.get(
2270
+ trace.uuid
2271
+ )
2272
+ is golden
2273
+ ):
2274
+ root = _pick_root_for_marking(trace)
2275
+ _mark_trace_error(trace, root, msg)
2276
+ break
2277
+
2278
+ elif cancelled or t.cancelled():
2279
+ cancel_exc = DeepEvalError(
2280
+ "Task was cancelled (likely due to timeout)."
2281
+ )
2282
+ msg = format_error_text(cancel_exc)
2283
+ trace, root = _resolve_trace_and_root_for_task(t)
2284
+ resolved_trace_from_task = bool(trace)
2285
+ resolved_root_from_task = bool(root)
2286
+ if trace:
2287
+ _mark_trace_error(trace, root, msg)
2288
+ else:
2289
+ for (
2290
+ trace
2291
+ ) in trace_manager.integration_traces_to_evaluate:
2292
+ if (
2293
+ trace_manager.trace_uuid_to_golden.get(
2294
+ trace.uuid
2295
+ )
2296
+ is golden
2297
+ ):
2298
+ root = _pick_root_for_marking(trace)
2299
+ _mark_trace_error(trace, root, msg)
2300
+ break
2301
+
1823
2302
  if get_settings().DEEPEVAL_DEBUG_ASYNC:
1824
2303
  # Using info level here to make it easy to spot these logs.
1825
- # We are gated by DEEPEVAL_DEBUG_ASYNC
1826
- meta = task_meta.get(t, {})
2304
+ golden_name = meta.get("golden_name")
1827
2305
  duration = time.perf_counter() - meta.get(
1828
2306
  "started", started
1829
2307
  )
1830
2308
 
1831
- if t.cancelled():
2309
+ if cancelled or exc is not None:
2310
+ if not resolved_trace_from_task:
2311
+ logger.warning(
2312
+ "[deepeval] on_task_done: no binding for task; falling back to golden->trace. task=%s golden=%r",
2313
+ t.get_name(),
2314
+ golden_name,
2315
+ )
2316
+ elif not resolved_root_from_task:
2317
+ logger.warning(
2318
+ "[deepeval] on_task_done: bound trace found but no bound root; using heuristic. task=%s trace=%s",
2319
+ t.get_name(),
2320
+ trace.uuid,
2321
+ )
2322
+
2323
+ if cancelled:
1832
2324
  logger.info(
1833
2325
  "[deepeval] task CANCELLED %s after %.2fs meta=%r",
1834
2326
  t.get_name(),
1835
2327
  duration,
1836
2328
  meta,
1837
2329
  )
2330
+ elif exc is not None:
2331
+ logger.error(
2332
+ "[deepeval] task ERROR %s after %.2fs meta=%r",
2333
+ t.get_name(),
2334
+ duration,
2335
+ meta,
2336
+ exc_info=(
2337
+ type(exc),
2338
+ exc,
2339
+ getattr(exc, "__traceback__", None),
2340
+ ),
2341
+ )
1838
2342
  else:
1839
- exc = t.exception()
1840
- if exc is not None:
1841
- logger.error(
1842
- "[deepeval] task ERROR %s after %.2fs meta=%r",
1843
- t.get_name(),
1844
- duration,
1845
- meta,
1846
- exc_info=(type(exc), exc, exc.__traceback__),
1847
- )
1848
- else:
1849
- logger.info(
1850
- "[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
1851
- t.get_name(),
1852
- duration,
1853
- meta.get("golden_index"),
1854
- )
2343
+ logger.info(
2344
+ "[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
2345
+ t.get_name(),
2346
+ duration,
2347
+ meta.get("golden_index"),
2348
+ )
1855
2349
 
2350
+ try:
2351
+ trace_manager.task_bindings.pop(t, None)
2352
+ except Exception:
2353
+ pass
1856
2354
  update_pbar(progress, pbar_callback_id)
1857
2355
  update_pbar(progress, pbar_id)
1858
2356
 
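The reworked `on_task_done` callback above distinguishes cancelled, errored, and successful tasks by calling `Task.exception()`, which raises `CancelledError` when the task was cancelled. A minimal standalone sketch of that done-callback pattern (task names and sleeps are illustrative):

```python
# Standalone sketch of the done-callback pattern; not deepeval's code.
import asyncio

def on_task_done(t: asyncio.Task) -> None:
    try:
        exc = t.exception()          # raises CancelledError if the task was cancelled
    except asyncio.CancelledError:
        print(f"{t.get_name()}: cancelled")
        return
    if exc is not None:
        print(f"{t.get_name()}: errored: {exc!r}")
    else:
        print(f"{t.get_name()}: ok, result={t.result()!r}")

async def main():
    ok = asyncio.create_task(asyncio.sleep(0.01, result="done"), name="ok")
    ok.add_done_callback(on_task_done)

    doomed = asyncio.create_task(asyncio.sleep(10), name="doomed")
    doomed.add_done_callback(on_task_done)
    doomed.cancel()

    await asyncio.gather(ok, doomed, return_exceptions=True)

asyncio.run(main())
```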
@@ -1897,6 +2395,7 @@ def a_execute_agentic_test_cases_from_loop(
1897
2395
  timeout=_gather_timeout(),
1898
2396
  )
1899
2397
  )
2398
+
1900
2399
  except asyncio.TimeoutError:
1901
2400
  import traceback
1902
2401
 
@@ -1950,12 +2449,12 @@ def a_execute_agentic_test_cases_from_loop(
1950
2449
  return
1951
2450
 
1952
2451
  try:
2452
+ current_tasks = set()
1953
2453
  # Find tasks that were created during this run but we didn’t track
1954
2454
  current_tasks = loop.run_until_complete(_snapshot_tasks())
1955
2455
  except RuntimeError:
1956
2456
  # this might happen if the loop is already closing
1957
- # nothing we can do
1958
- return
2457
+ pass
1959
2458
 
1960
2459
  leftovers = [
1961
2460
  t
@@ -1965,33 +2464,32 @@ def a_execute_agentic_test_cases_from_loop(
1965
2464
  and not t.done()
1966
2465
  ]
1967
2466
 
1968
- if not leftovers:
1969
- return
1970
-
1971
2467
  if get_settings().DEEPEVAL_DEBUG_ASYNC:
1972
- logger.warning(
1973
- "[deepeval] %d stray task(s) not tracked; cancelling...",
1974
- len(leftovers),
1975
- )
2468
+ if len(leftovers) > 0:
2469
+ logger.warning(
2470
+ "[deepeval] %d stray task(s) not tracked; cancelling...",
2471
+ len(leftovers),
2472
+ )
1976
2473
  for t in leftovers:
1977
2474
  meta = task_meta.get(t, {})
1978
2475
  name = t.get_name()
1979
2476
  logger.warning(" - STRAY %s meta=%s", name, meta)
1980
2477
 
1981
- for t in leftovers:
1982
- t.cancel()
2478
+ if leftovers:
2479
+ for t in leftovers:
2480
+ t.cancel()
1983
2481
 
1984
- # Drain strays so they don’t leak into the next iteration
1985
- try:
1986
- loop.run_until_complete(
1987
- asyncio.gather(*leftovers, return_exceptions=True)
1988
- )
1989
- except RuntimeError:
1990
- # If the loop is closing here, just continue
1991
- if get_settings().DEEPEVAL_DEBUG_ASYNC:
1992
- logger.warning(
1993
- "[deepeval] failed to drain stray tasks because loop is closing"
2482
+ # Drain strays so they don’t leak into the next iteration
2483
+ try:
2484
+ loop.run_until_complete(
2485
+ asyncio.gather(*leftovers, return_exceptions=True)
1994
2486
  )
2487
+ except RuntimeError:
2488
+ # If the loop is closing here, just continue
2489
+ if get_settings().DEEPEVAL_DEBUG_ASYNC:
2490
+ logger.warning(
2491
+ "[deepeval] failed to drain stray tasks because loop is closing"
2492
+ )
1995
2493
 
1996
2494
  # Evaluate traces
1997
2495
  if trace_manager.traces_to_evaluate:
@@ -2014,25 +2512,6 @@ def a_execute_agentic_test_cases_from_loop(
2014
2512
  pbar_id=pbar_id,
2015
2513
  )
2016
2514
  )
2017
- elif openai_test_case_pairs:
2018
- loop.run_until_complete(
2019
- _evaluate_test_case_pairs(
2020
- test_case_pairs=openai_test_case_pairs,
2021
- test_run=test_run,
2022
- test_run_manager=test_run_manager,
2023
- test_results=test_results,
2024
- ignore_errors=error_config.ignore_errors,
2025
- skip_on_missing_params=error_config.skip_on_missing_params,
2026
- show_indicator=display_config.show_indicator,
2027
- verbose_mode=display_config.verbose_mode,
2028
- throttle_value=async_config.throttle_value,
2029
- max_concurrent=async_config.max_concurrent,
2030
- _use_bar_indicator=_use_bar_indicator,
2031
- _is_assert_test=_is_assert_test,
2032
- progress=progress,
2033
- pbar_id=pbar_id,
2034
- )
2035
- )
2036
2515
  elif trace_manager.integration_traces_to_evaluate:
2037
2516
  loop.run_until_complete(
2038
2517
  _a_evaluate_traces(
@@ -2106,6 +2585,7 @@ def a_execute_agentic_test_cases_from_loop(
2106
2585
  local_trace_manager.evaluating = False
2107
2586
  local_trace_manager.traces_to_evaluate_order.clear()
2108
2587
  local_trace_manager.traces_to_evaluate.clear()
2588
+ local_trace_manager.trace_uuid_to_golden.clear()
2109
2589
 
2110
2590
 
2111
2591
  async def _a_evaluate_traces(
@@ -2129,11 +2609,32 @@ async def _a_evaluate_traces(
2129
2609
 
2130
2610
  async def execute_evals_with_semaphore(func: Callable, *args, **kwargs):
2131
2611
  async with semaphore:
2132
- return await func(*args, **kwargs)
2612
+ return await asyncio.wait_for(
2613
+ func(*args, **kwargs),
2614
+ timeout=_per_task_timeout(),
2615
+ )
2133
2616
 
2134
2617
  eval_tasks = []
2135
- for count, trace in enumerate(traces_to_evaluate):
2136
- golden = goldens[count]
2618
+ # Here, we will work off a fixed-set copy to avoid surprises from potential
2619
+ # mid-iteration mutation
2620
+ traces_snapshot = list(traces_to_evaluate or [])
2621
+
2622
+ for count, trace in enumerate(traces_snapshot):
2623
+ # Prefer the explicit mapping from trace -> golden captured at trace creation.
2624
+ golden = trace_manager.trace_uuid_to_golden.get(trace.uuid)
2625
+ if not golden:
2626
+ # trace started during evaluation_loop but the CURRENT_GOLDEN was
2627
+ # not set for some reason. We can’t map it to a golden, so the best
2628
+ # we can do is skip evaluation for this trace.
2629
+ if (
2630
+ logger.isEnabledFor(logging.DEBUG)
2631
+ and get_settings().DEEPEVAL_VERBOSE_MODE
2632
+ ):
2633
+ logger.debug(
2634
+ "Skipping trace %s: no golden association found during evaluation_loop ",
2635
+ trace.uuid,
2636
+ )
2637
+ continue
2137
2638
  with capture_evaluation_run("golden"):
2138
2639
  task = execute_evals_with_semaphore(
2139
2640
  func=_a_execute_agentic_test_case,
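The hunk above stops pairing traces with goldens by position (`goldens[count]`) and instead resolves each trace through `trace_manager.trace_uuid_to_golden`, skipping traces that never had a golden bound. A toy sketch of that lookup-and-skip behaviour, with plain strings standing in for the real Trace and Golden objects:

```python
# Toy lookup-and-skip sketch; plain strings stand in for Trace and Golden objects.
from typing import Dict, List, Tuple

trace_uuid_to_golden: Dict[str, str] = {"t-1": "golden-A", "t-3": "golden-C"}

def pair_traces_with_goldens(trace_uuids: List[str]) -> List[Tuple[str, str]]:
    pairs = []
    for uuid in trace_uuids:
        golden = trace_uuid_to_golden.get(uuid)
        if golden is None:
            continue  # no golden was bound when the trace started; skip rather than guess by index
        pairs.append((uuid, golden))
    return pairs

assert pair_traces_with_goldens(["t-1", "t-2", "t-3"]) == [("t-1", "golden-A"), ("t-3", "golden-C")]
```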
@@ -2154,7 +2655,18 @@ async def _a_evaluate_traces(
2154
2655
  )
2155
2656
  eval_tasks.append(asyncio.create_task(task))
2156
2657
  await asyncio.sleep(throttle_value)
2157
- await asyncio.gather(*eval_tasks)
2658
+
2659
+ try:
2660
+ await asyncio.wait_for(
2661
+ asyncio.gather(*eval_tasks),
2662
+ timeout=_gather_timeout(),
2663
+ )
2664
+ except asyncio.TimeoutError:
2665
+ for t in eval_tasks:
2666
+ if not t.done():
2667
+ t.cancel()
2668
+ await asyncio.gather(*eval_tasks, return_exceptions=True)
2669
+ raise
2158
2670
 
2159
2671
 
2160
2672
  async def _evaluate_test_case_pairs(
@@ -2177,7 +2689,10 @@ async def _evaluate_test_case_pairs(
2177
2689
 
2178
2690
  async def execute_with_semaphore(func: Callable, *args, **kwargs):
2179
2691
  async with semaphore:
2180
- return await func(*args, **kwargs)
2692
+ return await asyncio.wait_for(
2693
+ func(*args, **kwargs),
2694
+ timeout=_per_task_timeout(),
2695
+ )
2181
2696
 
2182
2697
  tasks = []
2183
2698
  for count, test_case_pair in enumerate(test_case_pairs):
@@ -2210,7 +2725,19 @@ async def _evaluate_test_case_pairs(
2210
2725
  )
2211
2726
  tasks.append(asyncio.create_task(task))
2212
2727
  await asyncio.sleep(throttle_value)
2213
- await asyncio.gather(*tasks)
2728
+
2729
+ try:
2730
+ await asyncio.wait_for(
2731
+ asyncio.gather(*tasks),
2732
+ timeout=_gather_timeout(),
2733
+ )
2734
+ except asyncio.TimeoutError:
2735
+ # Cancel any still-pending tasks and drain them
2736
+ for t in tasks:
2737
+ if not t.done():
2738
+ t.cancel()
2739
+ await asyncio.gather(*tasks, return_exceptions=True)
2740
+ raise
2214
2741
 
2215
2742
 
2216
2743
  def _execute_metric(
@@ -2225,13 +2752,14 @@ def _execute_metric(
2225
2752
  test_case,
2226
2753
  _show_indicator=show_metric_indicator,
2227
2754
  _in_component=in_component,
2755
+ _log_metric_to_confident=False,
2228
2756
  )
2229
2757
  except MissingTestCaseParamsError as e:
2230
2758
  if error_config.skip_on_missing_params:
2231
2759
  return "skip"
2232
2760
  else:
2233
2761
  if error_config.ignore_errors:
2234
- metric.error = str(e)
2762
+ metric.error = format_error_text(e)
2235
2763
  metric.success = False
2236
2764
  else:
2237
2765
  raise
@@ -2243,19 +2771,54 @@ def _execute_metric(
2243
2771
  return "skip"
2244
2772
  else:
2245
2773
  if error_config.ignore_errors:
2246
- metric.error = str(e)
2774
+ metric.error = format_error_text(e)
2247
2775
  metric.success = False
2248
2776
  else:
2249
2777
  raise
2250
2778
  except Exception as e:
2251
2779
  if error_config.ignore_errors:
2252
- metric.error = str(e)
2780
+ metric.error = format_error_text(e)
2253
2781
  metric.success = False
2254
2782
  else:
2255
2783
  raise
2256
2784
  except Exception as e:
2257
2785
  if error_config.ignore_errors:
2258
- metric.error = str(e)
2786
+ metric.error = format_error_text(e)
2259
2787
  metric.success = False
2260
2788
  else:
2261
2789
  raise
2790
+
2791
+
2792
+ def log_prompt(
2793
+ llm_span: LlmSpan,
2794
+ test_run_manager: TestRunManager,
2795
+ ):
2796
+ prompt = llm_span.prompt
2797
+ if prompt is None:
2798
+ return
2799
+
2800
+ span_hyperparameters = {}
2801
+ prompt_version = prompt.version if is_confident() else None
2802
+ key = f"{prompt.alias}_{prompt_version}"
2803
+ span_hyperparameters[key] = prompt
2804
+
2805
+ test_run = test_run_manager.get_test_run()
2806
+ if test_run.prompts is None:
2807
+ test_run.prompts = []
2808
+ if test_run.hyperparameters is None:
2809
+ test_run.hyperparameters = {}
2810
+
2811
+ if key not in test_run.hyperparameters:
2812
+ test_run.hyperparameters.update(
2813
+ process_hyperparameters(span_hyperparameters, False)
2814
+ )
2815
+ existing_prompt_keys = {
2816
+ f"{p.alias}_{p.version}" for p in test_run.prompts
2817
+ }
2818
+ new_prompts = process_prompts(span_hyperparameters)
2819
+ for new_prompt in new_prompts:
2820
+ new_prompt_key = f"{new_prompt.alias}_{new_prompt.version}"
2821
+ if new_prompt_key not in existing_prompt_keys:
2822
+ test_run.prompts.append(new_prompt)
2823
+
2824
+ global_test_run_manager.save_test_run(TEMP_FILE_PATH)
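
The `log_prompt` helper above dedupes by an `"{alias}_{version}"` key so a prompt reused across many LLM spans is recorded only once per test run, both in `hyperparameters` and in `prompts`. A simplified sketch of that dedup rule (`PromptStub` and `record_prompt` are hypothetical stand-ins, not deepeval APIs):

```python
# PromptStub and record_prompt are hypothetical stand-ins, not deepeval APIs.
from dataclasses import dataclass
from typing import Dict, List

@dataclass(frozen=True)
class PromptStub:
    alias: str
    version: str

def record_prompt(prompt: PromptStub,
                  hyperparameters: Dict[str, PromptStub],
                  prompts: List[PromptStub]) -> None:
    key = f"{prompt.alias}_{prompt.version}"
    if key not in hyperparameters:
        hyperparameters[key] = prompt
    if all(f"{p.alias}_{p.version}" != key for p in prompts):
        prompts.append(prompt)

hyperparameters: Dict[str, PromptStub] = {}
prompts: List[PromptStub] = []
for _ in range(3):                        # same prompt seen on three different LLM spans
    record_prompt(PromptStub("rag-answer", "v2"), hyperparameters, prompts)
assert len(hyperparameters) == 1 and len(prompts) == 1
```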