deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
@@ -43,15 +43,19 @@ from deepeval.tracing.api import (
43
43
  )
44
44
  from deepeval.dataset import Golden
45
45
  from deepeval.contextvars import set_current_golden, reset_current_golden
46
- from deepeval.errors import MissingTestCaseParamsError
46
+ from deepeval.errors import MissingTestCaseParamsError, DeepEvalError
47
47
  from deepeval.metrics.utils import copy_metrics
48
- from deepeval.utils import get_or_create_event_loop, shorten, len_medium
48
+ from deepeval.utils import (
49
+ get_or_create_event_loop,
50
+ shorten,
51
+ len_medium,
52
+ format_error_text,
53
+ )
49
54
  from deepeval.telemetry import capture_evaluation_run
50
55
  from deepeval.metrics import (
51
56
  BaseMetric,
52
57
  BaseConversationalMetric,
53
58
  BaseMultimodalMetric,
54
- TaskCompletionMetric,
55
59
  )
56
60
  from deepeval.metrics.indicator import (
57
61
  measure_metrics_with_indicator,
@@ -61,6 +65,7 @@ from deepeval.test_case import (
61
65
  ConversationalTestCase,
62
66
  MLLMTestCase,
63
67
  )
68
+ from deepeval.test_case.api import create_api_test_case
64
69
  from deepeval.test_run import (
65
70
  global_test_run_manager,
66
71
  LLMApiTestCase,
@@ -80,19 +85,127 @@ from deepeval.evaluate.utils import (
80
85
  create_api_trace,
81
86
  create_metric_data,
82
87
  create_test_result,
83
- create_api_test_case,
84
88
  count_metrics_in_trace,
89
+ count_total_metrics_for_trace,
90
+ count_metrics_in_span_subtree,
85
91
  extract_trace_test_results,
86
92
  )
87
93
  from deepeval.utils import add_pbar, update_pbar, custom_console
88
- from deepeval.openai.utils import openai_test_case_pairs
89
- from deepeval.tracing.types import TestCaseMetricPair
94
+ from deepeval.tracing.types import TestCaseMetricPair, TraceSpanStatus
95
+ from deepeval.tracing.api import TraceSpanApiStatus
90
96
  from deepeval.config.settings import get_settings
91
-
97
+ from deepeval.test_run import TEMP_FILE_PATH
98
+ from deepeval.confident.api import is_confident
99
+ from deepeval.test_run.hyperparameters import (
100
+ process_hyperparameters,
101
+ process_prompts,
102
+ )
92
103
 
93
104
  logger = logging.getLogger(__name__)
94
105
 
95
106
 
107
+ def _skip_metrics_for_error(
108
+ span: Optional[BaseSpan] = None,
109
+ trace: Optional[Trace] = None,
110
+ ) -> bool:
111
+ # trace failure: skip everything under this trace
112
+ if trace is not None and trace.status == TraceSpanStatus.ERRORED:
113
+ return True
114
+ # span failure: skip this span’s metrics
115
+ if span is not None and span.status == TraceSpanStatus.ERRORED:
116
+ return True
117
+ return False
118
+
119
+
120
+ def _trace_error(current_trace: Trace) -> Optional[str]:
121
+ def _first_err(s: BaseSpan) -> Optional[str]:
122
+ if s.status == TraceSpanStatus.ERRORED and s.error:
123
+ return s.error
124
+ for c in s.children or []:
125
+ e = _first_err(c)
126
+ if e:
127
+ return e
128
+ return None
129
+
130
+ for root in current_trace.root_spans or []:
131
+ e = _first_err(root)
132
+ if e:
133
+ return e
134
+ return None
135
+
136
+
137
+ def _get_trace_by_uuid_anywhere(trace_uuid: str):
138
+ """
139
+ Resolver for a trace UUID across the manager's state.
140
+
141
+ First tries the manager's indexed lookup, which covers active/in-flight traces,
142
+ then does a linear scan of the full `trace_manager.traces` list, which covers
143
+ traces that were recorded/closed earlier or not yet indexed. Returns
144
+ the concrete Trace object or None if not found.
145
+ """
146
+ tr = trace_manager.get_trace_by_uuid(trace_uuid)
147
+ if tr:
148
+ return tr
149
+ for tr in trace_manager.traces:
150
+ if tr.uuid == trace_uuid:
151
+ return tr
152
+ return None
153
+
154
+
155
+ def _pick_root_for_marking(trace):
156
+ """
157
+ Choose the most appropriate root span to annotate on error/cancel.
158
+
159
+ Heuristic:
160
+ - Prefer the most recent open root, which will have no `end_time` since this is the
161
+ span currently in flight.
162
+ - If none are open, use the last root span if it exists.
163
+ - If the trace has no roots, return None.
164
+
165
+ This favors marking the active root in multi-root traces while remaining
166
+ stable for already-closed traces.
167
+ """
168
+ open_roots = [rs for rs in trace.root_spans if rs.end_time is None]
169
+ return (
170
+ open_roots[-1]
171
+ if open_roots
172
+ else (trace.root_spans[-1] if trace.root_spans else None)
173
+ )
174
+
175
+
176
+ def _resolve_trace_and_root_for_task(t: asyncio.Task):
177
+ """
178
+ Resolve trace and root for a completed task using the weak binding map.
179
+
180
+ Steps:
181
+ 1. Look up the task in `trace_manager.task_bindings` to get the
182
+ bound `trace_uuid` and, if available, `root_span_uuid`.
183
+ 2. Resolve the Trace with `_get_trace_by_uuid_anywhere`.
184
+ 3. If a bound root UUID exists, try to find that exact root on the trace.
185
+ 4. Otherwise, fall back to `_pick_root_for_marking(trace)`.
186
+
187
+ Returns a trace / root tuple. Either may be `None` when no binding is
188
+ present. This function is used by `on_task_done` to robustly mark error/cancel
189
+ states without assuming a single root trace or a root that is still open.
190
+ """
191
+ binding = trace_manager.task_bindings.get(t) or {}
192
+ trace_uuid = binding.get("trace_uuid")
193
+ root_span_uuid = binding.get("root_span_uuid")
194
+
195
+ trace = _get_trace_by_uuid_anywhere(trace_uuid) if trace_uuid else None
196
+ root = None
197
+
198
+ if trace and root_span_uuid:
199
+ root = next(
200
+ (rs for rs in trace.root_spans if rs.uuid == root_span_uuid), None
201
+ )
202
+
203
+ if trace and root is None:
204
+ root = _pick_root_for_marking(trace)
205
+
206
+ return trace, root
207
+
208
+
96
209
  async def _snapshot_tasks():
97
210
  cur = asyncio.current_task()
98
211
  # `all_tasks` returns tasks for the current running loop only
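For context: the helper functions introduced in this hunk (`_skip_metrics_for_error`, `_trace_error`, `_pick_root_for_marking`, `_resolve_trace_and_root_for_task`) all reason over the trace's span tree when deciding whether to skip metrics and which root span to annotate. Below is a minimal, self-contained sketch of the "first error wins" depth-first search that `_trace_error` performs, using simplified stand-in types rather than deepeval's real `Trace`/`BaseSpan` classes:

```python
# Sketch only: Span is a hypothetical stand-in, not deepeval's BaseSpan.
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class Span:
    status: str = "SUCCESS"              # "ERRORED" mirrors TraceSpanStatus.ERRORED
    error: Optional[str] = None
    children: List["Span"] = field(default_factory=list)

def first_error(span: Span) -> Optional[str]:
    """Depth-first search that returns the first error message found, if any."""
    if span.status == "ERRORED" and span.error:
        return span.error
    for child in span.children:
        err = first_error(child)
        if err:
            return err
    return None

root = Span(children=[Span(), Span(status="ERRORED", error="tool call failed")])
assert first_error(root) == "tool call failed"
```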
@@ -111,6 +224,20 @@ def _gather_timeout() -> float:
111
224
  )
112
225
 
113
226
 
227
+ def filter_duplicate_results(
228
+ main_result: TestResult, results: List[TestResult]
229
+ ) -> List[TestResult]:
230
+ return [
231
+ result
232
+ for result in results
233
+ if not (
234
+ (result.input == main_result.input)
235
+ and (result.actual_output == main_result.actual_output)
236
+ and (result.metrics_data == main_result.metrics_data)
237
+ )
238
+ ]
239
+
240
+
114
241
  ###########################################
115
242
  ### E2E Evals #############################
116
243
  ###########################################
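The `filter_duplicate_results` helper added above keeps trace-level results out of the final list when they are exact copies of the main test result (same input, actual output, and metrics data). A small usage sketch with a hypothetical `TestResult` stand-in:

```python
# Sketch only: FakeResult is a hypothetical stand-in for deepeval's TestResult.
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class FakeResult:
    input: str
    actual_output: Optional[str]
    metrics_data: Optional[list]

def filter_duplicate_results(main: FakeResult, results: List[FakeResult]) -> List[FakeResult]:
    # Drop results that duplicate the main result on (input, actual_output, metrics_data)
    return [
        r for r in results
        if not (
            r.input == main.input
            and r.actual_output == main.actual_output
            and r.metrics_data == main.metrics_data
        )
    ]

main = FakeResult("hi", "hello", [])
trace_results = [FakeResult("hi", "hello", []), FakeResult("retrieval step", "docs", [])]
assert len(filter_duplicate_results(main, trace_results)) == 1
```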
@@ -376,7 +503,10 @@ async def a_execute_test_cases(
376
503
 
377
504
  async def execute_with_semaphore(func: Callable, *args, **kwargs):
378
505
  async with semaphore:
379
- return await func(*args, **kwargs)
506
+ return await asyncio.wait_for(
507
+ func(*args, **kwargs),
508
+ timeout=_per_task_timeout(),
509
+ )
380
510
 
381
511
  global_test_run_cache_manager.disable_write_cache = (
382
512
  cache_config.write_cache is False
@@ -495,7 +625,20 @@ async def a_execute_test_cases(
495
625
  tasks.append(asyncio.create_task(task))
496
626
 
497
627
  await asyncio.sleep(async_config.throttle_value)
498
- await asyncio.gather(*tasks)
628
+
629
+ try:
630
+ await asyncio.wait_for(
631
+ asyncio.gather(*tasks),
632
+ timeout=_gather_timeout(),
633
+ )
634
+ except asyncio.TimeoutError:
635
+ # Cancel any still-pending tasks and drain them
636
+ for t in tasks:
637
+ if not t.done():
638
+ t.cancel()
639
+ await asyncio.gather(*tasks, return_exceptions=True)
640
+ raise
641
+
499
642
  else:
500
643
  for test_case in test_cases:
501
644
  with capture_evaluation_run("test case"):
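This hunk and the previous one follow the same concurrency pattern: `execute_with_semaphore` now wraps each coroutine in `asyncio.wait_for` for a per-task deadline, and the surrounding `asyncio.gather` gets a global deadline with cancel-and-drain on expiry. A standalone sketch of that pattern, with illustrative constants standing in for `_per_task_timeout()` and `_gather_timeout()`:

```python
# Standalone sketch, not deepeval's code: timeout values are illustrative only.
import asyncio

PER_TASK_TIMEOUT = 30.0
GATHER_TIMEOUT = 120.0

async def run_all(coros, max_concurrent: int = 5):
    semaphore = asyncio.Semaphore(max_concurrent)

    async def bounded(coro):
        # Per-task deadline, enforced inside the concurrency limit
        async with semaphore:
            return await asyncio.wait_for(coro, timeout=PER_TASK_TIMEOUT)

    tasks = [asyncio.create_task(bounded(c)) for c in coros]
    try:
        # Global deadline for the whole batch
        return await asyncio.wait_for(asyncio.gather(*tasks), timeout=GATHER_TIMEOUT)
    except asyncio.TimeoutError:
        # Cancel whatever is still pending and drain it so nothing leaks
        for t in tasks:
            if not t.done():
                t.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
        raise

async def main():
    async def work(i: int) -> int:
        await asyncio.sleep(0.01)
        return i

    print(await run_all([work(i) for i in range(10)]))

asyncio.run(main())
```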
@@ -568,7 +711,19 @@ async def a_execute_test_cases(
568
711
  tasks.append(asyncio.create_task(task))
569
712
 
570
713
  await asyncio.sleep(async_config.throttle_value)
571
- await asyncio.gather(*tasks)
714
+
715
+ try:
716
+ await asyncio.wait_for(
717
+ asyncio.gather(*tasks),
718
+ timeout=_gather_timeout(),
719
+ )
720
+ except asyncio.TimeoutError:
721
+ # Cancel any still-pending tasks and drain them
722
+ for t in tasks:
723
+ if not t.done():
724
+ t.cancel()
725
+ await asyncio.gather(*tasks, return_exceptions=True)
726
+ raise
572
727
 
573
728
  return test_results
574
729
 
@@ -843,6 +998,7 @@ def execute_agentic_test_cases(
843
998
  _progress=progress,
844
999
  _pbar_callback_id=pbar_tags_id,
845
1000
  ):
1001
+
846
1002
  if asyncio.iscoroutinefunction(observed_callback):
847
1003
  loop = get_or_create_event_loop()
848
1004
  coro = observed_callback(golden.input)
@@ -894,14 +1050,16 @@ def execute_agentic_test_cases(
894
1050
  pbar_eval_id: Optional[int] = None,
895
1051
  ):
896
1052
  # Create API Span
897
- metrics: List[BaseMetric] = span.metrics
1053
+ metrics: List[BaseMetric] = list(span.metrics or [])
898
1054
  api_span: BaseApiSpan = (
899
1055
  trace_manager._convert_span_to_api_span(span)
900
1056
  )
1057
+
901
1058
  if isinstance(span, AgentSpan):
902
1059
  trace_api.agent_spans.append(api_span)
903
1060
  elif isinstance(span, LlmSpan):
904
1061
  trace_api.llm_spans.append(api_span)
1062
+ log_prompt(span, test_run_manager)
905
1063
  elif isinstance(span, RetrieverSpan):
906
1064
  trace_api.retriever_spans.append(api_span)
907
1065
  elif isinstance(span, ToolSpan):
@@ -909,14 +1067,27 @@ def execute_agentic_test_cases(
909
1067
  else:
910
1068
  trace_api.base_spans.append(api_span)
911
1069
 
1070
+ # Skip errored trace/span
1071
+ if _skip_metrics_for_error(span=span, trace=current_trace):
1072
+ api_span.status = TraceSpanApiStatus.ERRORED
1073
+ api_span.error = span.error or _trace_error(
1074
+ current_trace
1075
+ )
1076
+ if progress and pbar_eval_id is not None:
1077
+ update_pbar(
1078
+ progress,
1079
+ pbar_eval_id,
1080
+ advance=count_metrics_in_span_subtree(span),
1081
+ )
1082
+ return
1083
+
912
1084
  for child in span.children:
913
1085
  dfs(child, progress, pbar_eval_id)
914
1086
 
915
- if span.metrics is None:
1087
+ if not span.metrics:
916
1088
  return
917
- has_task_completion = any(
918
- isinstance(metric, TaskCompletionMetric)
919
- for metric in span.metrics
1089
+ requires_trace = any(
1090
+ metric.requires_trace for metric in span.metrics
920
1091
  )
921
1092
 
922
1093
  llm_test_case = None
@@ -934,18 +1105,30 @@ def execute_agentic_test_cases(
934
1105
  tools_called=span.tools_called,
935
1106
  expected_tools=span.expected_tools,
936
1107
  )
937
- if llm_test_case is None and not has_task_completion:
938
- raise ValueError(
939
- "Unable to run metrics on span without LLMTestCase. Are you sure you called `update_current_span()`?"
940
- )
941
1108
 
942
1109
  # add trace if task completion
943
- if has_task_completion:
1110
+ if requires_trace:
944
1111
  if llm_test_case is None:
945
1112
  llm_test_case = LLMTestCase(input="None")
946
1113
  llm_test_case._trace_dict = (
947
1114
  trace_manager.create_nested_spans_dict(span)
948
1115
  )
1116
+ else:
1117
+ if llm_test_case is None:
1118
+ api_span.status = TraceSpanApiStatus.ERRORED
1119
+ api_span.error = format_error_text(
1120
+ DeepEvalError(
1121
+ "Span has metrics but no LLMTestCase. "
1122
+ "Are you sure you called `update_current_span()`?"
1123
+ )
1124
+ )
1125
+ if progress and pbar_eval_id is not None:
1126
+ update_pbar(
1127
+ progress,
1128
+ pbar_eval_id,
1129
+ advance=count_metrics_in_span_subtree(span),
1130
+ )
1131
+ return
949
1132
 
950
1133
  # Preparing metric calculation
951
1134
  api_span.metrics_data = []
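The changes above replace the `isinstance(metric, TaskCompletionMetric)` check with the `metric.requires_trace` capability flag, so any metric can declare that it needs the full trace rather than a plain `LLMTestCase`. A compact sketch of that dispatch with hypothetical metric classes:

```python
# Hypothetical metric classes; only the requires_trace flag mirrors the diff above.
class BaseMetric:
    requires_trace: bool = False

class AnswerRelevancy(BaseMetric):
    pass

class TaskCompletion(BaseMetric):
    requires_trace = True   # needs the full trace, not just an LLMTestCase

def needs_trace(metrics) -> bool:
    return any(m.requires_trace for m in metrics)

assert needs_trace([AnswerRelevancy(), TaskCompletion()]) is True
assert needs_trace([AnswerRelevancy()]) is False
```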
@@ -984,72 +1167,111 @@ def execute_agentic_test_cases(
984
1167
 
985
1168
  start_time = time.perf_counter()
986
1169
 
1170
+ skip_metrics_for_this_golden = False
987
1171
  # Handle trace-level metrics
988
- if current_trace.metrics:
989
- has_task_completion = any(
990
- isinstance(metric, TaskCompletionMetric)
991
- for metric in current_trace.metrics
992
- )
993
-
994
- llm_test_case = None
995
- if current_trace.input:
996
- llm_test_case = LLMTestCase(
997
- input=str(current_trace.input),
998
- actual_output=(
999
- str(current_trace.output)
1000
- if current_trace.output is not None
1001
- else None
1172
+ if _skip_metrics_for_error(trace=current_trace):
1173
+ trace_api.status = TraceSpanApiStatus.ERRORED
1174
+ if progress and pbar_eval_id is not None:
1175
+ update_pbar(
1176
+ progress,
1177
+ pbar_eval_id,
1178
+ advance=count_total_metrics_for_trace(
1179
+ current_trace
1002
1180
  ),
1003
- expected_output=current_trace.expected_output,
1004
- context=current_trace.context,
1005
- retrieval_context=current_trace.retrieval_context,
1006
- tools_called=current_trace.tools_called,
1007
- expected_tools=current_trace.expected_tools,
1008
1181
  )
1009
- if llm_test_case is None and not has_task_completion:
1010
- raise ValueError(
1011
- "Unable to run metrics on trace without LLMTestCase. Are you sure you called `update_current_trace()`?"
1182
+ else:
1183
+ if current_trace.metrics:
1184
+ requires_trace = any(
1185
+ metric.requires_trace
1186
+ for metric in current_trace.metrics
1012
1187
  )
1013
1188
 
1014
- if has_task_completion:
1015
- if llm_test_case is None:
1016
- llm_test_case = LLMTestCase(input="None")
1017
- llm_test_case._trace_dict = (
1018
- trace_manager.create_nested_spans_dict(
1019
- current_trace.root_spans[0]
1189
+ llm_test_case = None
1190
+ if current_trace.input:
1191
+ llm_test_case = LLMTestCase(
1192
+ input=str(current_trace.input),
1193
+ actual_output=(
1194
+ str(current_trace.output)
1195
+ if current_trace.output is not None
1196
+ else None
1197
+ ),
1198
+ expected_output=current_trace.expected_output,
1199
+ context=current_trace.context,
1200
+ retrieval_context=current_trace.retrieval_context,
1201
+ tools_called=current_trace.tools_called,
1202
+ expected_tools=current_trace.expected_tools,
1020
1203
  )
1021
- )
1022
-
1023
- for metric in current_trace.metrics:
1024
- metric.skipped = False
1025
- metric.error = None
1026
- if display_config.verbose_mode is not None:
1027
- metric.verbose_mode = display_config.verbose_mode
1028
-
1029
- trace_api.metrics_data = []
1030
- for metric in current_trace.metrics:
1031
- res = _execute_metric(
1032
- metric=metric,
1033
- test_case=llm_test_case,
1034
- show_metric_indicator=show_metric_indicator,
1035
- in_component=True,
1036
- error_config=error_config,
1037
- )
1038
- if res == "skip":
1039
- continue
1040
-
1041
- if not metric.skipped:
1042
- metric_data = create_metric_data(metric)
1043
- trace_api.metrics_data.append(metric_data)
1044
- api_test_case.update_metric_data(metric_data)
1045
- api_test_case.update_status(metric_data.success)
1046
- update_pbar(progress, pbar_eval_id)
1204
+ if requires_trace:
1205
+ if llm_test_case is None:
1206
+ llm_test_case = LLMTestCase(input="None")
1207
+ llm_test_case._trace_dict = (
1208
+ trace_manager.create_nested_spans_dict(
1209
+ current_trace.root_spans[0]
1210
+ )
1211
+ )
1212
+ else:
1213
+ if llm_test_case is None:
1214
+ current_trace.status = TraceSpanStatus.ERRORED
1215
+ trace_api.status = TraceSpanApiStatus.ERRORED
1216
+ if current_trace.root_spans:
1217
+ current_trace.root_spans[0].status = (
1218
+ TraceSpanStatus.ERRORED
1219
+ )
1220
+ current_trace.root_spans[0].error = (
1221
+ format_error_text(
1222
+ DeepEvalError(
1223
+ "Trace has metrics but no LLMTestCase (missing input/output). "
1224
+ "Are you sure you called `update_current_trace()`?"
1225
+ )
1226
+ )
1227
+ )
1228
+ if progress and pbar_eval_id is not None:
1229
+ update_pbar(
1230
+ progress,
1231
+ pbar_eval_id,
1232
+ advance=count_total_metrics_for_trace(
1233
+ current_trace
1234
+ ),
1235
+ )
1236
+ skip_metrics_for_this_golden = True
1237
+
1238
+ if not skip_metrics_for_this_golden:
1239
+ for metric in current_trace.metrics:
1240
+ metric.skipped = False
1241
+ metric.error = None
1242
+ if display_config.verbose_mode is not None:
1243
+ metric.verbose_mode = (
1244
+ display_config.verbose_mode
1245
+ )
1246
+
1247
+ trace_api.metrics_data = []
1248
+ for metric in current_trace.metrics:
1249
+ res = _execute_metric(
1250
+ metric=metric,
1251
+ test_case=llm_test_case,
1252
+ show_metric_indicator=show_metric_indicator,
1253
+ in_component=True,
1254
+ error_config=error_config,
1255
+ )
1256
+ if res == "skip":
1257
+ continue
1258
+
1259
+ if not metric.skipped:
1260
+ metric_data = create_metric_data(metric)
1261
+ trace_api.metrics_data.append(metric_data)
1262
+ api_test_case.update_metric_data(
1263
+ metric_data
1264
+ )
1265
+ api_test_case.update_status(
1266
+ metric_data.success
1267
+ )
1268
+ update_pbar(progress, pbar_eval_id)
1269
+
1270
+ # Then handle span-level metrics
1271
+ dfs(current_trace.root_spans[0], progress, pbar_eval_id)
1047
1272
 
1048
- # Then handle span-level metrics
1049
- dfs(current_trace.root_spans[0], progress, pbar_eval_id)
1050
1273
  end_time = time.perf_counter()
1051
1274
  run_duration = end_time - start_time
1052
-
1053
1275
  # Update test run
1054
1276
  api_test_case.update_run_duration(run_duration)
1055
1277
  test_run_manager.update_test_run(api_test_case, test_case)
@@ -1097,7 +1319,10 @@ async def a_execute_agentic_test_cases(
1097
1319
 
1098
1320
  async def execute_with_semaphore(func: Callable, *args, **kwargs):
1099
1321
  async with semaphore:
1100
- return await func(*args, **kwargs)
1322
+ return await asyncio.wait_for(
1323
+ func(*args, **kwargs),
1324
+ timeout=_per_task_timeout(),
1325
+ )
1101
1326
 
1102
1327
  test_run_manager = global_test_run_manager
1103
1328
  test_run_manager.save_to_disk = cache_config.write_cache
@@ -1144,7 +1369,19 @@ async def a_execute_agentic_test_cases(
1144
1369
  tasks.append(asyncio.create_task(task))
1145
1370
  await asyncio.sleep(async_config.throttle_value)
1146
1371
 
1147
- await asyncio.gather(*tasks)
1372
+ try:
1373
+ await asyncio.wait_for(
1374
+ asyncio.gather(*tasks),
1375
+ timeout=_gather_timeout(),
1376
+ )
1377
+ except asyncio.TimeoutError:
1378
+ # Cancel any still-pending tasks and drain them
1379
+ for t in tasks:
1380
+ if not t.done():
1381
+ t.cancel()
1382
+ await asyncio.gather(*tasks, return_exceptions=True)
1383
+ raise
1384
+
1148
1385
  else:
1149
1386
  for golden in goldens:
1150
1387
  with capture_evaluation_run("golden"):
@@ -1261,7 +1498,7 @@ async def _a_execute_agentic_test_case(
1261
1498
  )
1262
1499
 
1263
1500
  await _a_execute_trace_test_case(
1264
- trace=trace,
1501
+ trace=current_trace,
1265
1502
  trace_api=trace_api,
1266
1503
  api_test_case=api_test_case,
1267
1504
  ignore_errors=ignore_errors,
@@ -1273,9 +1510,10 @@ async def _a_execute_agentic_test_case(
1273
1510
  _use_bar_indicator=_use_bar_indicator,
1274
1511
  )
1275
1512
 
1276
- async def dfs(span: BaseSpan):
1513
+ async def dfs(trace: Trace, span: BaseSpan):
1277
1514
  await _a_execute_span_test_case(
1278
1515
  span=span,
1516
+ current_trace=trace,
1279
1517
  trace_api=trace_api,
1280
1518
  api_test_case=api_test_case,
1281
1519
  ignore_errors=ignore_errors,
@@ -1284,27 +1522,61 @@ async def _a_execute_agentic_test_case(
1284
1522
  verbose_mode=verbose_mode,
1285
1523
  progress=progress,
1286
1524
  pbar_eval_id=pbar_eval_id,
1525
+ test_run_manager=test_run_manager,
1287
1526
  _use_bar_indicator=_use_bar_indicator,
1288
1527
  )
1289
- child_tasks = [dfs(child) for child in span.children]
1528
+
1529
+ if _skip_metrics_for_error(span=span, trace=trace):
1530
+ return
1531
+
1532
+ child_tasks = [
1533
+ asyncio.create_task(dfs(trace, child)) for child in span.children
1534
+ ]
1290
1535
  if child_tasks:
1291
- await asyncio.gather(*child_tasks)
1536
+ try:
1537
+ await asyncio.wait_for(
1538
+ asyncio.gather(*child_tasks),
1539
+ timeout=_gather_timeout(),
1540
+ )
1541
+ except asyncio.TimeoutError:
1542
+ for t in child_tasks:
1543
+ if not t.done():
1544
+ t.cancel()
1545
+ await asyncio.gather(*child_tasks, return_exceptions=True)
1546
+ raise
1292
1547
 
1293
1548
  test_start_time = time.perf_counter()
1294
- await dfs(current_trace.root_spans[0])
1549
+
1550
+ if not _skip_metrics_for_error(trace=current_trace):
1551
+ if current_trace and current_trace.root_spans:
1552
+ await dfs(current_trace, current_trace.root_spans[0])
1553
+ else:
1554
+ if (
1555
+ logger.isEnabledFor(logging.DEBUG)
1556
+ and get_settings().DEEPEVAL_VERBOSE_MODE
1557
+ ):
1558
+ logger.debug(
1559
+ "Skipping DFS: empty trace or no root spans (trace=%s)",
1560
+ current_trace.uuid if current_trace else None,
1561
+ )
1562
+
1295
1563
  test_end_time = time.perf_counter()
1296
1564
  run_duration = test_end_time - test_start_time
1297
1565
 
1298
1566
  api_test_case.update_run_duration(run_duration)
1299
1567
  test_run_manager.update_test_run(api_test_case, test_case)
1300
- test_results.append(create_test_result(api_test_case))
1301
- test_results.extend(extract_trace_test_results(trace_api))
1568
+ main_result = create_test_result(api_test_case)
1569
+ trace_results = extract_trace_test_results(trace_api)
1570
+ unique_trace_results = filter_duplicate_results(main_result, trace_results)
1571
+ test_results.append(main_result)
1572
+ test_results.extend(unique_trace_results)
1302
1573
 
1303
1574
  update_pbar(progress, pbar_id)
1304
1575
 
1305
1576
 
1306
1577
  async def _a_execute_span_test_case(
1307
1578
  span: BaseSpan,
1579
+ current_trace: Trace,
1308
1580
  trace_api: TraceApi,
1309
1581
  api_test_case: LLMApiTestCase,
1310
1582
  ignore_errors: bool,
@@ -1313,6 +1585,7 @@ async def _a_execute_span_test_case(
1313
1585
  verbose_mode: Optional[bool],
1314
1586
  progress: Optional[Progress],
1315
1587
  pbar_eval_id: Optional[int],
1588
+ test_run_manager: Optional[TestRunManager],
1316
1589
  _use_bar_indicator: bool,
1317
1590
  ):
1318
1591
  api_span: BaseApiSpan = trace_manager._convert_span_to_api_span(span)
@@ -1320,6 +1593,7 @@ async def _a_execute_span_test_case(
1320
1593
  trace_api.agent_spans.append(api_span)
1321
1594
  elif isinstance(span, LlmSpan):
1322
1595
  trace_api.llm_spans.append(api_span)
1596
+ log_prompt(span, test_run_manager)
1323
1597
  elif isinstance(span, RetrieverSpan):
1324
1598
  trace_api.retriever_spans.append(api_span)
1325
1599
  elif isinstance(span, ToolSpan):
@@ -1327,12 +1601,22 @@ async def _a_execute_span_test_case(
1327
1601
  else:
1328
1602
  trace_api.base_spans.append(api_span)
1329
1603
 
1330
- if span.metrics is None:
1604
+ if _skip_metrics_for_error(span=span, trace=current_trace):
1605
+ api_span.status = TraceSpanApiStatus.ERRORED
1606
+ api_span.error = span.error or _trace_error(current_trace)
1607
+ if progress and pbar_eval_id is not None:
1608
+ update_pbar(
1609
+ progress,
1610
+ pbar_eval_id,
1611
+ advance=count_metrics_in_span_subtree(span),
1612
+ )
1331
1613
  return
1332
1614
 
1333
- has_task_completion = any(
1334
- isinstance(metric, TaskCompletionMetric) for metric in span.metrics
1335
- )
1615
+ metrics: List[BaseMetric] = list(span.metrics or [])
1616
+ if not metrics:
1617
+ return
1618
+
1619
+ requires_trace = any(metric.requires_trace for metric in metrics)
1336
1620
 
1337
1621
  llm_test_case = None
1338
1622
  if span.input:
@@ -1345,17 +1629,29 @@ async def _a_execute_span_test_case(
1345
1629
  tools_called=span.tools_called,
1346
1630
  expected_tools=span.expected_tools,
1347
1631
  )
1348
- if llm_test_case is None and not has_task_completion:
1349
- raise ValueError(
1350
- "Unable to run metrics on span without LLMTestCase. Are you sure you called `update_current_span()`?"
1351
- )
1632
+
1633
+ if not requires_trace:
1634
+ if llm_test_case is None:
1635
+ api_span.status = TraceSpanApiStatus.ERRORED
1636
+ api_span.error = format_error_text(
1637
+ DeepEvalError(
1638
+ "Span has metrics but no LLMTestCase. "
1639
+ "Are you sure you called `update_current_span()`?"
1640
+ )
1641
+ )
1642
+ if progress and pbar_eval_id is not None:
1643
+ update_pbar(
1644
+ progress,
1645
+ pbar_eval_id,
1646
+ advance=count_metrics_in_span_subtree(span),
1647
+ )
1648
+ return
1352
1649
 
1353
1650
  show_metrics_indicator = show_indicator and not _use_bar_indicator
1354
- metrics: List[BaseMetric] = span.metrics
1355
1651
  test_case: Optional[LLMTestCase] = llm_test_case
1356
1652
 
1357
1653
  # add trace if task completion
1358
- if has_task_completion:
1654
+ if requires_trace:
1359
1655
  if test_case is None:
1360
1656
  test_case = LLMTestCase(input="None")
1361
1657
  test_case._trace_dict = trace_manager.create_nested_spans_dict(span)
@@ -1399,12 +1695,22 @@ async def _a_execute_trace_test_case(
1399
1695
  pbar_eval_id: Optional[int],
1400
1696
  _use_bar_indicator: bool,
1401
1697
  ):
1402
- if trace.metrics is None:
1698
+
1699
+ if _skip_metrics_for_error(trace=trace):
1700
+ trace_api.status = TraceSpanApiStatus.ERRORED
1701
+ if progress and pbar_eval_id is not None:
1702
+ update_pbar(
1703
+ progress,
1704
+ pbar_eval_id,
1705
+ advance=count_total_metrics_for_trace(trace),
1706
+ )
1403
1707
  return
1404
1708
 
1405
- has_task_completion = any(
1406
- isinstance(metric, TaskCompletionMetric) for metric in trace.metrics
1407
- )
1709
+ metrics: List[BaseMetric] = list(trace.metrics or [])
1710
+ if not metrics:
1711
+ return
1712
+
1713
+ requires_trace = any(metric.requires_trace for metric in metrics)
1408
1714
 
1409
1715
  llm_test_case = None
1410
1716
  if trace.input:
@@ -1419,17 +1725,32 @@ async def _a_execute_trace_test_case(
1419
1725
  tools_called=trace.tools_called,
1420
1726
  expected_tools=trace.expected_tools,
1421
1727
  )
1422
- if llm_test_case is None and not has_task_completion:
1423
- raise ValueError(
1424
- "Unable to run metrics on trace without LLMTestCase. Are you sure you called `update_current_trace()`?"
1425
- )
1728
+
1729
+ if not requires_trace:
1730
+ if llm_test_case is None:
1731
+ trace.status = TraceSpanStatus.ERRORED
1732
+ trace_api.status = TraceSpanApiStatus.ERRORED
1733
+ if trace.root_spans:
1734
+ trace.root_spans[0].status = TraceSpanStatus.ERRORED
1735
+ trace.root_spans[0].error = format_error_text(
1736
+ DeepEvalError(
1737
+ "Trace has metrics but no LLMTestCase (missing input/output). "
1738
+ "Are you sure you called `update_current_trace()`?"
1739
+ )
1740
+ )
1741
+ if progress and pbar_eval_id is not None:
1742
+ update_pbar(
1743
+ progress,
1744
+ pbar_eval_id,
1745
+ advance=count_total_metrics_for_trace(trace),
1746
+ )
1747
+ return
1426
1748
 
1427
1749
  show_metrics_indicator = show_indicator and not _use_bar_indicator
1428
- metrics: List[BaseMetric] = trace.metrics
1429
1750
  test_case: Optional[LLMTestCase] = llm_test_case
1430
1751
 
1431
1752
  # add trace if task completion
1432
- if has_task_completion:
1753
+ if requires_trace:
1433
1754
  if test_case is None:
1434
1755
  test_case = LLMTestCase(input="None")
1435
1756
  test_case._trace_dict = trace_manager.create_nested_spans_dict(
@@ -1559,15 +1880,17 @@ def execute_agentic_test_cases_from_loop(
1559
1880
  pbar_eval_id: Optional[int] = None,
1560
1881
  ):
1561
1882
  # Create API Span
1562
- metrics: List[BaseMetric] = span.metrics
1883
+ metrics: List[BaseMetric] = list(span.metrics or [])
1563
1884
 
1564
1885
  api_span: BaseApiSpan = (
1565
1886
  trace_manager._convert_span_to_api_span(span)
1566
1887
  )
1888
+
1567
1889
  if isinstance(span, AgentSpan):
1568
1890
  trace_api.agent_spans.append(api_span)
1569
1891
  elif isinstance(span, LlmSpan):
1570
1892
  trace_api.llm_spans.append(api_span)
1893
+ log_prompt(span, test_run_manager)
1571
1894
  elif isinstance(span, RetrieverSpan):
1572
1895
  trace_api.retriever_spans.append(api_span)
1573
1896
  elif isinstance(span, ToolSpan):
@@ -1575,9 +1898,30 @@ def execute_agentic_test_cases_from_loop(
1575
1898
  else:
1576
1899
  trace_api.base_spans.append(api_span)
1577
1900
 
1901
+ # Skip errored trace/span
1902
+ if _skip_metrics_for_error(span=span, trace=current_trace):
1903
+ api_span.status = TraceSpanApiStatus.ERRORED
1904
+ api_span.error = span.error or _trace_error(
1905
+ current_trace
1906
+ )
1907
+ if progress and pbar_eval_id is not None:
1908
+ update_pbar(
1909
+ progress,
1910
+ pbar_eval_id,
1911
+ advance=count_metrics_in_span_subtree(span),
1912
+ )
1913
+ return
1914
+
1578
1915
  for child in span.children:
1579
1916
  dfs(child, progress, pbar_eval_id)
1580
1917
 
1918
+ if not span.metrics:
1919
+ return
1920
+
1921
+ requires_trace = any(
1922
+ metric.requires_trace for metric in metrics
1923
+ )
1924
+
1581
1925
  llm_test_case = None
1582
1926
  if span.input is not None:
1583
1927
  llm_test_case = LLMTestCase(
@@ -1593,20 +1937,29 @@ def execute_agentic_test_cases_from_loop(
1593
1937
  tools_called=span.tools_called,
1594
1938
  expected_tools=span.expected_tools,
1595
1939
  )
1596
- if span.metrics is None or llm_test_case is None:
1597
- return
1598
1940
 
1599
- has_task_completion = any(
1600
- isinstance(metric, TaskCompletionMetric)
1601
- for metric in metrics
1602
- )
1603
-
1604
- if has_task_completion:
1941
+ if requires_trace:
1605
1942
  if llm_test_case is None:
1606
1943
  llm_test_case = LLMTestCase(input="None")
1607
1944
  llm_test_case._trace_dict = (
1608
1945
  trace_manager.create_nested_spans_dict(span)
1609
1946
  )
1947
+ else:
1948
+ if llm_test_case is None:
1949
+ api_span.status = TraceSpanApiStatus.ERRORED
1950
+ api_span.error = format_error_text(
1951
+ DeepEvalError(
1952
+ "Span has metrics but no LLMTestCase. "
1953
+ "Are you sure you called `update_current_span()`?"
1954
+ )
1955
+ )
1956
+ if progress and pbar_eval_id is not None:
1957
+ update_pbar(
1958
+ progress,
1959
+ pbar_eval_id,
1960
+ advance=count_metrics_in_span_subtree(span),
1961
+ )
1962
+ return
1610
1963
 
1611
1964
  # Preparing metric calculation
1612
1965
  api_span.metrics_data = []
@@ -1650,77 +2003,123 @@ def execute_agentic_test_cases_from_loop(
1650
2003
  start_time = time.perf_counter()
1651
2004
 
1652
2005
  # Handle trace-level metrics
1653
- if current_trace.metrics:
1654
- has_task_completion = any(
1655
- isinstance(metric, TaskCompletionMetric)
1656
- for metric in current_trace.metrics
1657
- )
1658
-
1659
- llm_test_case = None
1660
- if current_trace.input:
1661
- llm_test_case = LLMTestCase(
1662
- input=str(current_trace.input),
1663
- actual_output=(
1664
- str(current_trace.output)
1665
- if current_trace.output is not None
1666
- else None
2006
+ skip_metrics_for_this_golden = False
2007
+ if _skip_metrics_for_error(trace=current_trace):
2008
+ trace_api.status = TraceSpanApiStatus.ERRORED
2009
+ if progress and pbar_eval_id is not None:
2010
+ update_pbar(
2011
+ progress,
2012
+ pbar_eval_id,
2013
+ advance=count_total_metrics_for_trace(
2014
+ current_trace
1667
2015
  ),
1668
- expected_output=current_trace.expected_output,
1669
- context=current_trace.context,
1670
- retrieval_context=current_trace.retrieval_context,
1671
- tools_called=current_trace.tools_called,
1672
- expected_tools=current_trace.expected_tools,
1673
2016
  )
1674
- if llm_test_case is None and not has_task_completion:
1675
- raise ValueError(
1676
- "Unable to run metrics on trace without LLMTestCase. Are you sure you called `update_current_trace()`?"
2017
+ else:
2018
+ if current_trace.metrics:
2019
+ requires_trace = any(
2020
+ metric.requires_trace
2021
+ for metric in current_trace.metrics
1677
2022
  )
1678
2023
 
1679
- if has_task_completion:
1680
- if llm_test_case is None:
1681
- llm_test_case = LLMTestCase(input="None")
1682
- llm_test_case._trace_dict = (
1683
- trace_manager.create_nested_spans_dict(
1684
- current_trace.root_spans[0]
2024
+ llm_test_case = None
2025
+ if current_trace.input:
2026
+ llm_test_case = LLMTestCase(
2027
+ input=str(current_trace.input),
2028
+ actual_output=(
2029
+ str(current_trace.output)
2030
+ if current_trace.output is not None
2031
+ else None
2032
+ ),
2033
+ expected_output=current_trace.expected_output,
2034
+ context=current_trace.context,
2035
+ retrieval_context=current_trace.retrieval_context,
2036
+ tools_called=current_trace.tools_called,
2037
+ expected_tools=current_trace.expected_tools,
1685
2038
  )
1686
- )
1687
-
1688
- for metric in current_trace.metrics:
1689
- metric.skipped = False
1690
- metric.error = None
1691
- if display_config.verbose_mode is not None:
1692
- metric.verbose_mode = display_config.verbose_mode
1693
-
1694
- trace_api.metrics_data = []
1695
- for metric in current_trace.metrics:
1696
- res = _execute_metric(
1697
- metric=metric,
1698
- test_case=llm_test_case,
1699
- show_metric_indicator=show_metric_indicator,
1700
- in_component=True,
1701
- error_config=error_config,
1702
- )
1703
- if res == "skip":
1704
- continue
1705
-
1706
- if not metric.skipped:
1707
- metric_data = create_metric_data(metric)
1708
- trace_api.metrics_data.append(metric_data)
1709
- api_test_case.update_metric_data(metric_data)
1710
- api_test_case.update_status(metric_data.success)
1711
- update_pbar(progress, pbar_eval_id)
1712
2039
 
1713
- # Then handle span-level metrics
1714
- dfs(current_trace.root_spans[0], progress, pbar_eval_id)
1715
- end_time = time.perf_counter()
1716
- run_duration = end_time - start_time
1717
-
1718
- # Update test run
1719
- api_test_case.update_run_duration(run_duration)
1720
- test_run_manager.update_test_run(api_test_case, test_case)
1721
- test_results.append(create_test_result(api_test_case))
2040
+ if requires_trace:
2041
+ if llm_test_case is None:
2042
+ llm_test_case = LLMTestCase(input="None")
2043
+ llm_test_case._trace_dict = (
2044
+ trace_manager.create_nested_spans_dict(
2045
+ current_trace.root_spans[0]
2046
+ )
2047
+ )
2048
+ else:
2049
+ if llm_test_case is None:
2050
+ current_trace.status = TraceSpanStatus.ERRORED
2051
+ trace_api.status = TraceSpanApiStatus.ERRORED
2052
+ if current_trace.root_spans:
2053
+ current_trace.root_spans[0].status = (
2054
+ TraceSpanStatus.ERRORED
2055
+ )
2056
+ current_trace.root_spans[0].error = (
2057
+ format_error_text(
2058
+ DeepEvalError(
2059
+ "Trace has metrics but no LLMTestCase (missing input/output). "
2060
+ "Are you sure you called `update_current_trace()`?"
2061
+ )
2062
+ )
2063
+ )
2064
+ if progress and pbar_eval_id is not None:
2065
+ update_pbar(
2066
+ progress,
2067
+ pbar_eval_id,
2068
+ advance=count_total_metrics_for_trace(
2069
+ current_trace
2070
+ ),
2071
+ )
2072
+ skip_metrics_for_this_golden = True
2073
+
2074
+ if not skip_metrics_for_this_golden:
2075
+ for metric in current_trace.metrics:
2076
+ metric.skipped = False
2077
+ metric.error = None
2078
+ if display_config.verbose_mode is not None:
2079
+ metric.verbose_mode = (
2080
+ display_config.verbose_mode
2081
+ )
2082
+
2083
+ trace_api.metrics_data = []
2084
+ for metric in current_trace.metrics:
2085
+ res = _execute_metric(
2086
+ metric=metric,
2087
+ test_case=llm_test_case,
2088
+ show_metric_indicator=show_metric_indicator,
2089
+ in_component=True,
2090
+ error_config=error_config,
2091
+ )
2092
+ if res == "skip":
2093
+ continue
2094
+
2095
+ if not metric.skipped:
2096
+ metric_data = create_metric_data(metric)
2097
+ trace_api.metrics_data.append(metric_data)
2098
+ api_test_case.update_metric_data(
2099
+ metric_data
2100
+ )
2101
+ api_test_case.update_status(
2102
+ metric_data.success
2103
+ )
2104
+ update_pbar(progress, pbar_eval_id)
2105
+
2106
+ # Then handle span-level metrics
2107
+ dfs(current_trace.root_spans[0], progress, pbar_eval_id)
2108
+
2109
+ end_time = time.perf_counter()
2110
+ run_duration = end_time - start_time
2111
+ # Update test run
2112
+ api_test_case.update_run_duration(run_duration)
2113
+ test_run_manager.update_test_run(api_test_case, test_case)
2114
+ main_result = create_test_result(api_test_case)
2115
+ trace_results = extract_trace_test_results(trace_api)
2116
+ unique_trace_results = filter_duplicate_results(
2117
+ main_result, trace_results
2118
+ )
2119
+ test_results.append(main_result)
2120
+ test_results.extend(unique_trace_results)
1722
2121
 
1723
- update_pbar(progress, pbar_id)
2122
+ update_pbar(progress, pbar_id)
1724
2123
 
1725
2124
  try:
1726
2125
  if display_config.show_indicator and _use_bar_indicator:
@@ -1748,6 +2147,7 @@ def execute_agentic_test_cases_from_loop(
1748
2147
  local_trace_manager.evaluating = False
1749
2148
  local_trace_manager.traces_to_evaluate_order.clear()
1750
2149
  local_trace_manager.traces_to_evaluate.clear()
2150
+ local_trace_manager.trace_uuid_to_golden.clear()
1751
2151
 
1752
2152
 
1753
2153
  def a_execute_agentic_test_cases_from_loop(
@@ -1820,39 +2220,137 @@ def a_execute_agentic_test_cases_from_loop(
1820
2220
  }
1821
2221
 
1822
2222
  def on_task_done(t: asyncio.Task):
2223
+ cancelled = False
2224
+ exc = None
2225
+ trace = None
2226
+ root = None
2227
+ resolved_trace_from_task = False
2228
+ resolved_root_from_task = False
2229
+
2230
+ # Task.exception() raises CancelledError if task was cancelled
2231
+ try:
2232
+ exc = t.exception()
2233
+ except asyncio.CancelledError:
2234
+ cancelled = True
2235
+ exc = None
2236
+
2237
+ meta = task_meta.get(t, {})
2238
+ golden_index = meta.get("golden_index")
2239
+
2240
+ if golden_index is not None and 0 <= golden_index < len(
2241
+ goldens
2242
+ ):
2243
+ golden = goldens[golden_index]
2244
+
2245
+ def _mark_trace_error(trace, root, msg: str):
2246
+ now = time.perf_counter()
2247
+ trace.status = TraceSpanStatus.ERRORED
2248
+ # Close the trace so the API layer has a proper endTime
2249
+ if trace.end_time is None:
2250
+ trace.end_time = now
2251
+ if root:
2252
+ root.status = TraceSpanStatus.ERRORED
2253
+ root.error = msg
2254
+ if root.end_time is None:
2255
+ root.end_time = now
2256
+
2257
+ if exc is not None:
2258
+ msg = format_error_text(exc)
2259
+ trace, root = _resolve_trace_and_root_for_task(t)
2260
+ resolved_trace_from_task = bool(trace)
2261
+ resolved_root_from_task = bool(root)
2262
+ if trace:
2263
+ _mark_trace_error(trace, root, msg)
2264
+ else:
2265
+ for (
2266
+ trace
2267
+ ) in trace_manager.integration_traces_to_evaluate:
2268
+ if (
2269
+ trace_manager.trace_uuid_to_golden.get(
2270
+ trace.uuid
2271
+ )
2272
+ is golden
2273
+ ):
2274
+ root = _pick_root_for_marking(trace)
2275
+ _mark_trace_error(trace, root, msg)
2276
+ break
2277
+
2278
+ elif cancelled or t.cancelled():
2279
+ cancel_exc = DeepEvalError(
2280
+ "Task was cancelled (likely due to timeout)."
2281
+ )
2282
+ msg = format_error_text(cancel_exc)
2283
+ trace, root = _resolve_trace_and_root_for_task(t)
2284
+ resolved_trace_from_task = bool(trace)
2285
+ resolved_root_from_task = bool(root)
2286
+ if trace:
2287
+ _mark_trace_error(trace, root, msg)
2288
+ else:
2289
+ for (
2290
+ trace
2291
+ ) in trace_manager.integration_traces_to_evaluate:
2292
+ if (
2293
+ trace_manager.trace_uuid_to_golden.get(
2294
+ trace.uuid
2295
+ )
2296
+ is golden
2297
+ ):
2298
+ root = _pick_root_for_marking(trace)
2299
+ _mark_trace_error(trace, root, msg)
2300
+ break
2301
+
1823
2302
  if get_settings().DEEPEVAL_DEBUG_ASYNC:
1824
2303
  # Using info level here to make it easy to spot these logs.
1825
- # We are gated by DEEPEVAL_DEBUG_ASYNC
1826
- meta = task_meta.get(t, {})
2304
+ golden_name = meta.get("golden_name")
1827
2305
  duration = time.perf_counter() - meta.get(
1828
2306
  "started", started
1829
2307
  )
1830
2308
 
1831
- if t.cancelled():
2309
+ if cancelled or exc is not None:
2310
+ if not resolved_trace_from_task:
2311
+ logger.warning(
2312
+ "[deepeval] on_task_done: no binding for task; falling back to golden->trace. task=%s golden=%r",
2313
+ t.get_name(),
2314
+ golden_name,
2315
+ )
2316
+ elif not resolved_root_from_task:
2317
+ logger.warning(
2318
+ "[deepeval] on_task_done: bound trace found but no bound root; using heuristic. task=%s trace=%s",
2319
+ t.get_name(),
2320
+ trace.uuid,
2321
+ )
2322
+
2323
+ if cancelled:
1832
2324
  logger.info(
1833
2325
  "[deepeval] task CANCELLED %s after %.2fs meta=%r",
1834
2326
  t.get_name(),
1835
2327
  duration,
1836
2328
  meta,
1837
2329
  )
2330
+ elif exc is not None:
2331
+ logger.error(
2332
+ "[deepeval] task ERROR %s after %.2fs meta=%r",
2333
+ t.get_name(),
2334
+ duration,
2335
+ meta,
2336
+ exc_info=(
2337
+ type(exc),
2338
+ exc,
2339
+ getattr(exc, "__traceback__", None),
2340
+ ),
2341
+ )
1838
2342
  else:
1839
- exc = t.exception()
1840
- if exc is not None:
1841
- logger.error(
1842
- "[deepeval] task ERROR %s after %.2fs meta=%r",
1843
- t.get_name(),
1844
- duration,
1845
- meta,
1846
- exc_info=(type(exc), exc, exc.__traceback__),
1847
- )
1848
- else:
1849
- logger.info(
1850
- "[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
1851
- t.get_name(),
1852
- duration,
1853
- meta.get("golden_index"),
1854
- )
2343
+ logger.info(
2344
+ "[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
2345
+ t.get_name(),
2346
+ duration,
2347
+ meta.get("golden_index"),
2348
+ )
1855
2349
 
2350
+ try:
2351
+ trace_manager.task_bindings.pop(t, None)
2352
+ except Exception:
2353
+ pass
1856
2354
  update_pbar(progress, pbar_callback_id)
1857
2355
  update_pbar(progress, pbar_id)
1858
2356
 
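The reworked `on_task_done` callback above distinguishes cancelled, errored, and successful tasks by calling `Task.exception()`, which raises `CancelledError` when the task was cancelled. A minimal standalone sketch of that done-callback pattern (task names and sleeps are illustrative):

```python
# Standalone sketch of the done-callback pattern; not deepeval's code.
import asyncio

def on_task_done(t: asyncio.Task) -> None:
    try:
        exc = t.exception()          # raises CancelledError if the task was cancelled
    except asyncio.CancelledError:
        print(f"{t.get_name()}: cancelled")
        return
    if exc is not None:
        print(f"{t.get_name()}: errored: {exc!r}")
    else:
        print(f"{t.get_name()}: ok, result={t.result()!r}")

async def main():
    ok = asyncio.create_task(asyncio.sleep(0.01, result="done"), name="ok")
    ok.add_done_callback(on_task_done)

    doomed = asyncio.create_task(asyncio.sleep(10), name="doomed")
    doomed.add_done_callback(on_task_done)
    doomed.cancel()

    await asyncio.gather(ok, doomed, return_exceptions=True)

asyncio.run(main())
```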
@@ -1897,6 +2395,7 @@ def a_execute_agentic_test_cases_from_loop(
1897
2395
  timeout=_gather_timeout(),
1898
2396
  )
1899
2397
  )
2398
+
1900
2399
  except asyncio.TimeoutError:
1901
2400
  import traceback
1902
2401
 
@@ -1950,12 +2449,12 @@ def a_execute_agentic_test_cases_from_loop(
1950
2449
  return
1951
2450
 
1952
2451
  try:
2452
+ current_tasks = set()
1953
2453
  # Find tasks that were created during this run but we didn’t track
1954
2454
  current_tasks = loop.run_until_complete(_snapshot_tasks())
1955
2455
  except RuntimeError:
1956
2456
  # this might happen if the loop is already closing
1957
- # nothing we can do
1958
- return
2457
+ pass
1959
2458
 
1960
2459
  leftovers = [
1961
2460
  t
@@ -1965,33 +2464,32 @@ def a_execute_agentic_test_cases_from_loop(
1965
2464
  and not t.done()
1966
2465
  ]
1967
2466
 
1968
- if not leftovers:
1969
- return
1970
-
1971
2467
  if get_settings().DEEPEVAL_DEBUG_ASYNC:
1972
- logger.warning(
1973
- "[deepeval] %d stray task(s) not tracked; cancelling...",
1974
- len(leftovers),
1975
- )
2468
+ if len(leftovers) > 0:
2469
+ logger.warning(
2470
+ "[deepeval] %d stray task(s) not tracked; cancelling...",
2471
+ len(leftovers),
2472
+ )
1976
2473
  for t in leftovers:
1977
2474
  meta = task_meta.get(t, {})
1978
2475
  name = t.get_name()
1979
2476
  logger.warning(" - STRAY %s meta=%s", name, meta)
1980
2477
 
1981
- for t in leftovers:
1982
- t.cancel()
2478
+ if leftovers:
2479
+ for t in leftovers:
2480
+ t.cancel()
1983
2481
 
1984
- # Drain strays so they don’t leak into the next iteration
1985
- try:
1986
- loop.run_until_complete(
1987
- asyncio.gather(*leftovers, return_exceptions=True)
1988
- )
1989
- except RuntimeError:
1990
- # If the loop is closing here, just continue
1991
- if get_settings().DEEPEVAL_DEBUG_ASYNC:
1992
- logger.warning(
1993
- "[deepeval] failed to drain stray tasks because loop is closing"
2482
+ # Drain strays so they don’t leak into the next iteration
2483
+ try:
2484
+ loop.run_until_complete(
2485
+ asyncio.gather(*leftovers, return_exceptions=True)
1994
2486
  )
2487
+ except RuntimeError:
2488
+ # If the loop is closing here, just continue
2489
+ if get_settings().DEEPEVAL_DEBUG_ASYNC:
2490
+ logger.warning(
2491
+ "[deepeval] failed to drain stray tasks because loop is closing"
2492
+ )
1995
2493
 
1996
2494
  # Evaluate traces
1997
2495
  if trace_manager.traces_to_evaluate:
@@ -2014,25 +2512,6 @@ def a_execute_agentic_test_cases_from_loop(
2014
2512
  pbar_id=pbar_id,
2015
2513
  )
2016
2514
  )
2017
- elif openai_test_case_pairs:
2018
- loop.run_until_complete(
2019
- _evaluate_test_case_pairs(
2020
- test_case_pairs=openai_test_case_pairs,
2021
- test_run=test_run,
2022
- test_run_manager=test_run_manager,
2023
- test_results=test_results,
2024
- ignore_errors=error_config.ignore_errors,
2025
- skip_on_missing_params=error_config.skip_on_missing_params,
2026
- show_indicator=display_config.show_indicator,
2027
- verbose_mode=display_config.verbose_mode,
2028
- throttle_value=async_config.throttle_value,
2029
- max_concurrent=async_config.max_concurrent,
2030
- _use_bar_indicator=_use_bar_indicator,
2031
- _is_assert_test=_is_assert_test,
2032
- progress=progress,
2033
- pbar_id=pbar_id,
2034
- )
2035
- )
2036
2515
  elif trace_manager.integration_traces_to_evaluate:
2037
2516
  loop.run_until_complete(
2038
2517
  _a_evaluate_traces(
@@ -2106,6 +2585,7 @@ def a_execute_agentic_test_cases_from_loop(
2106
2585
  local_trace_manager.evaluating = False
2107
2586
  local_trace_manager.traces_to_evaluate_order.clear()
2108
2587
  local_trace_manager.traces_to_evaluate.clear()
2588
+ local_trace_manager.trace_uuid_to_golden.clear()
2109
2589
 
2110
2590
 
2111
2591
  async def _a_evaluate_traces(
@@ -2129,11 +2609,32 @@ async def _a_evaluate_traces(
2129
2609
 
2130
2610
  async def execute_evals_with_semaphore(func: Callable, *args, **kwargs):
2131
2611
  async with semaphore:
2132
- return await func(*args, **kwargs)
2612
+ return await asyncio.wait_for(
2613
+ func(*args, **kwargs),
2614
+ timeout=_per_task_timeout(),
2615
+ )
2133
2616
 
2134
2617
  eval_tasks = []
2135
- for count, trace in enumerate(traces_to_evaluate):
2136
- golden = goldens[count]
2618
+ # Here, we will work off a fixed-set copy to avoid surprises from potential
2619
+ # mid-iteration mutation
2620
+ traces_snapshot = list(traces_to_evaluate or [])
2621
+
2622
+ for count, trace in enumerate(traces_snapshot):
2623
+ # Prefer the explicit mapping from trace -> golden captured at trace creation.
2624
+ golden = trace_manager.trace_uuid_to_golden.get(trace.uuid)
2625
+ if not golden:
2626
+ # trace started during evaluation_loop but the CURRENT_GOLDEN was
2627
+ # not set for some reason. We can’t map it to a golden, so the best
2628
+ # we can do is skip evaluation for this trace.
2629
+ if (
2630
+ logger.isEnabledFor(logging.DEBUG)
2631
+ and get_settings().DEEPEVAL_VERBOSE_MODE
2632
+ ):
2633
+ logger.debug(
2634
+ "Skipping trace %s: no golden association found during evaluation_loop ",
2635
+ trace.uuid,
2636
+ )
2637
+ continue
2137
2638
  with capture_evaluation_run("golden"):
2138
2639
  task = execute_evals_with_semaphore(
2139
2640
  func=_a_execute_agentic_test_case,
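The hunk above stops pairing traces with goldens by position (`goldens[count]`) and instead resolves each trace through `trace_manager.trace_uuid_to_golden`, skipping traces that never had a golden bound. A toy sketch of that lookup-and-skip behaviour, with plain strings standing in for the real Trace and Golden objects:

```python
# Toy lookup-and-skip sketch; plain strings stand in for Trace and Golden objects.
from typing import Dict, List, Tuple

trace_uuid_to_golden: Dict[str, str] = {"t-1": "golden-A", "t-3": "golden-C"}

def pair_traces_with_goldens(trace_uuids: List[str]) -> List[Tuple[str, str]]:
    pairs = []
    for uuid in trace_uuids:
        golden = trace_uuid_to_golden.get(uuid)
        if golden is None:
            continue  # no golden was bound when the trace started; skip rather than guess by index
        pairs.append((uuid, golden))
    return pairs

assert pair_traces_with_goldens(["t-1", "t-2", "t-3"]) == [("t-1", "golden-A"), ("t-3", "golden-C")]
```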
@@ -2154,7 +2655,18 @@ async def _a_evaluate_traces(
2154
2655
  )
2155
2656
  eval_tasks.append(asyncio.create_task(task))
2156
2657
  await asyncio.sleep(throttle_value)
2157
- await asyncio.gather(*eval_tasks)
2658
+
2659
+ try:
2660
+ await asyncio.wait_for(
2661
+ asyncio.gather(*eval_tasks),
2662
+ timeout=_gather_timeout(),
2663
+ )
2664
+ except asyncio.TimeoutError:
2665
+ for t in eval_tasks:
2666
+ if not t.done():
2667
+ t.cancel()
2668
+ await asyncio.gather(*eval_tasks, return_exceptions=True)
2669
+ raise
2158
2670
 
2159
2671
 
2160
2672
  async def _evaluate_test_case_pairs(
@@ -2177,7 +2689,10 @@ async def _evaluate_test_case_pairs(
2177
2689
 
2178
2690
  async def execute_with_semaphore(func: Callable, *args, **kwargs):
2179
2691
  async with semaphore:
2180
- return await func(*args, **kwargs)
2692
+ return await asyncio.wait_for(
2693
+ func(*args, **kwargs),
2694
+ timeout=_per_task_timeout(),
2695
+ )
2181
2696
 
2182
2697
  tasks = []
2183
2698
  for count, test_case_pair in enumerate(test_case_pairs):
@@ -2210,7 +2725,19 @@ async def _evaluate_test_case_pairs(
2210
2725
  )
2211
2726
  tasks.append(asyncio.create_task(task))
2212
2727
  await asyncio.sleep(throttle_value)
2213
- await asyncio.gather(*tasks)
2728
+
2729
+ try:
2730
+ await asyncio.wait_for(
2731
+ asyncio.gather(*tasks),
2732
+ timeout=_gather_timeout(),
2733
+ )
2734
+ except asyncio.TimeoutError:
2735
+ # Cancel any still-pending tasks and drain them
2736
+ for t in tasks:
2737
+ if not t.done():
2738
+ t.cancel()
2739
+ await asyncio.gather(*tasks, return_exceptions=True)
2740
+ raise
2214
2741
 
2215
2742
 
2216
2743
  def _execute_metric(
@@ -2225,13 +2752,14 @@ def _execute_metric(
2225
2752
  test_case,
2226
2753
  _show_indicator=show_metric_indicator,
2227
2754
  _in_component=in_component,
2755
+ _log_metric_to_confident=False,
2228
2756
  )
2229
2757
  except MissingTestCaseParamsError as e:
2230
2758
  if error_config.skip_on_missing_params:
2231
2759
  return "skip"
2232
2760
  else:
2233
2761
  if error_config.ignore_errors:
2234
- metric.error = str(e)
2762
+ metric.error = format_error_text(e)
2235
2763
  metric.success = False
2236
2764
  else:
2237
2765
  raise
@@ -2243,19 +2771,54 @@ def _execute_metric(
2243
2771
  return "skip"
2244
2772
  else:
2245
2773
  if error_config.ignore_errors:
2246
- metric.error = str(e)
2774
+ metric.error = format_error_text(e)
2247
2775
  metric.success = False
2248
2776
  else:
2249
2777
  raise
2250
2778
  except Exception as e:
2251
2779
  if error_config.ignore_errors:
2252
- metric.error = str(e)
2780
+ metric.error = format_error_text(e)
2253
2781
  metric.success = False
2254
2782
  else:
2255
2783
  raise
2256
2784
  except Exception as e:
2257
2785
  if error_config.ignore_errors:
2258
- metric.error = str(e)
2786
+ metric.error = format_error_text(e)
2259
2787
  metric.success = False
2260
2788
  else:
2261
2789
  raise
2790
+
2791
+
2792
+ def log_prompt(
2793
+ llm_span: LlmSpan,
2794
+ test_run_manager: TestRunManager,
2795
+ ):
2796
+ prompt = llm_span.prompt
2797
+ if prompt is None:
2798
+ return
2799
+
2800
+ span_hyperparameters = {}
2801
+ prompt_version = prompt.version if is_confident() else None
2802
+ key = f"{prompt.alias}_{prompt_version}"
2803
+ span_hyperparameters[key] = prompt
2804
+
2805
+ test_run = test_run_manager.get_test_run()
2806
+ if test_run.prompts is None:
2807
+ test_run.prompts = []
2808
+ if test_run.hyperparameters is None:
2809
+ test_run.hyperparameters = {}
2810
+
2811
+ if key not in test_run.hyperparameters:
2812
+ test_run.hyperparameters.update(
2813
+ process_hyperparameters(span_hyperparameters, False)
2814
+ )
2815
+ existing_prompt_keys = {
2816
+ f"{p.alias}_{p.version}" for p in test_run.prompts
2817
+ }
2818
+ new_prompts = process_prompts(span_hyperparameters)
2819
+ for new_prompt in new_prompts:
2820
+ new_prompt_key = f"{new_prompt.alias}_{new_prompt.version}"
2821
+ if new_prompt_key not in existing_prompt_keys:
2822
+ test_run.prompts.append(new_prompt)
2823
+
2824
+ global_test_run_manager.save_test_run(TEMP_FILE_PATH)
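
The `log_prompt` helper above dedupes by an `"{alias}_{version}"` key so a prompt reused across many LLM spans is recorded only once per test run, both in `hyperparameters` and in `prompts`. A simplified sketch of that dedup rule (`PromptStub` and `record_prompt` are hypothetical stand-ins, not deepeval APIs):

```python
# PromptStub and record_prompt are hypothetical stand-ins, not deepeval APIs.
from dataclasses import dataclass
from typing import Dict, List

@dataclass(frozen=True)
class PromptStub:
    alias: str
    version: str

def record_prompt(prompt: PromptStub,
                  hyperparameters: Dict[str, PromptStub],
                  prompts: List[PromptStub]) -> None:
    key = f"{prompt.alias}_{prompt.version}"
    if key not in hyperparameters:
        hyperparameters[key] = prompt
    if all(f"{p.alias}_{p.version}" != key for p in prompts):
        prompts.append(prompt)

hyperparameters: Dict[str, PromptStub] = {}
prompts: List[PromptStub] = []
for _ in range(3):                        # same prompt seen on three different LLM spans
    record_prompt(PromptStub("rag-answer", "v2"), hyperparameters, prompts)
assert len(hyperparameters) == 1 and len(prompts) == 1
```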