deepeval 3.6.7__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. deepeval/_version.py +1 -1
  2. deepeval/errors.py +20 -2
  3. deepeval/evaluate/execute.py +725 -217
  4. deepeval/evaluate/types.py +1 -0
  5. deepeval/evaluate/utils.py +13 -3
  6. deepeval/integrations/crewai/__init__.py +2 -1
  7. deepeval/integrations/crewai/tool.py +71 -0
  8. deepeval/integrations/llama_index/__init__.py +0 -4
  9. deepeval/integrations/llama_index/handler.py +20 -21
  10. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  11. deepeval/metrics/__init__.py +13 -0
  12. deepeval/metrics/base_metric.py +1 -0
  13. deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
  14. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  15. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
  16. deepeval/metrics/dag/schema.py +1 -1
  17. deepeval/metrics/dag/templates.py +2 -2
  18. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  19. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  20. deepeval/metrics/goal_accuracy/schema.py +17 -0
  21. deepeval/metrics/goal_accuracy/template.py +235 -0
  22. deepeval/metrics/hallucination/hallucination.py +8 -8
  23. deepeval/metrics/mcp/mcp_task_completion.py +7 -2
  24. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
  25. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
  26. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
  27. deepeval/metrics/plan_adherence/__init__.py +1 -0
  28. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  29. deepeval/metrics/plan_adherence/schema.py +11 -0
  30. deepeval/metrics/plan_adherence/template.py +170 -0
  31. deepeval/metrics/plan_quality/__init__.py +1 -0
  32. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  33. deepeval/metrics/plan_quality/schema.py +11 -0
  34. deepeval/metrics/plan_quality/template.py +101 -0
  35. deepeval/metrics/step_efficiency/__init__.py +1 -0
  36. deepeval/metrics/step_efficiency/schema.py +11 -0
  37. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  38. deepeval/metrics/step_efficiency/template.py +256 -0
  39. deepeval/metrics/task_completion/task_completion.py +1 -0
  40. deepeval/metrics/tool_correctness/schema.py +6 -0
  41. deepeval/metrics/tool_correctness/template.py +88 -0
  42. deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
  43. deepeval/metrics/tool_use/__init__.py +1 -0
  44. deepeval/metrics/tool_use/schema.py +19 -0
  45. deepeval/metrics/tool_use/template.py +220 -0
  46. deepeval/metrics/tool_use/tool_use.py +458 -0
  47. deepeval/metrics/topic_adherence/__init__.py +1 -0
  48. deepeval/metrics/topic_adherence/schema.py +16 -0
  49. deepeval/metrics/topic_adherence/template.py +162 -0
  50. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  52. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  53. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  54. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  55. deepeval/openai/extractors.py +61 -16
  56. deepeval/openai/patch.py +8 -12
  57. deepeval/openai/types.py +1 -1
  58. deepeval/openai/utils.py +108 -1
  59. deepeval/prompt/prompt.py +1 -0
  60. deepeval/prompt/utils.py +43 -14
  61. deepeval/synthesizer/synthesizer.py +11 -10
  62. deepeval/test_case/llm_test_case.py +6 -2
  63. deepeval/test_run/test_run.py +190 -207
  64. deepeval/tracing/__init__.py +2 -1
  65. deepeval/tracing/otel/exporter.py +3 -4
  66. deepeval/tracing/otel/utils.py +23 -4
  67. deepeval/tracing/trace_context.py +53 -38
  68. deepeval/tracing/tracing.py +23 -0
  69. deepeval/tracing/types.py +16 -14
  70. deepeval/utils.py +21 -0
  71. {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  72. {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/RECORD +75 -53
  73. deepeval/integrations/llama_index/agent/patched.py +0 -68
  74. deepeval/tracing/message_types/__init__.py +0 -10
  75. deepeval/tracing/message_types/base.py +0 -6
  76. deepeval/tracing/message_types/messages.py +0 -14
  77. deepeval/tracing/message_types/tools.py +0 -18
  78. {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  79. {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  80. {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
@@ -43,15 +43,19 @@ from deepeval.tracing.api import (
43
43
  )
44
44
  from deepeval.dataset import Golden
45
45
  from deepeval.contextvars import set_current_golden, reset_current_golden
46
- from deepeval.errors import MissingTestCaseParamsError
46
+ from deepeval.errors import MissingTestCaseParamsError, DeepEvalError
47
47
  from deepeval.metrics.utils import copy_metrics
48
- from deepeval.utils import get_or_create_event_loop, shorten, len_medium
48
+ from deepeval.utils import (
49
+ get_or_create_event_loop,
50
+ shorten,
51
+ len_medium,
52
+ format_error_text,
53
+ )
49
54
  from deepeval.telemetry import capture_evaluation_run
50
55
  from deepeval.metrics import (
51
56
  BaseMetric,
52
57
  BaseConversationalMetric,
53
58
  BaseMultimodalMetric,
54
- TaskCompletionMetric,
55
59
  )
56
60
  from deepeval.metrics.indicator import (
57
61
  measure_metrics_with_indicator,
@@ -82,10 +86,13 @@ from deepeval.evaluate.utils import (
82
86
  create_metric_data,
83
87
  create_test_result,
84
88
  count_metrics_in_trace,
89
+ count_total_metrics_for_trace,
90
+ count_metrics_in_span_subtree,
85
91
  extract_trace_test_results,
86
92
  )
87
93
  from deepeval.utils import add_pbar, update_pbar, custom_console
88
- from deepeval.tracing.types import TestCaseMetricPair
94
+ from deepeval.tracing.types import TestCaseMetricPair, TraceSpanStatus
95
+ from deepeval.tracing.api import TraceSpanApiStatus
89
96
  from deepeval.config.settings import get_settings
90
97
  from deepeval.test_run import TEMP_FILE_PATH
91
98
  from deepeval.confident.api import is_confident
@@ -97,6 +104,108 @@ from deepeval.test_run.hyperparameters import (
97
104
  logger = logging.getLogger(__name__)
98
105
 
99
106
 
107
+ def _skip_metrics_for_error(
108
+ span: Optional[BaseSpan] = None,
109
+ trace: Optional[Trace] = None,
110
+ ) -> bool:
111
+ # trace failure: skip everything under this trace
112
+ if trace is not None and trace.status == TraceSpanStatus.ERRORED:
113
+ return True
114
+ # span failure: skip this span’s metrics
115
+ if span is not None and span.status == TraceSpanStatus.ERRORED:
116
+ return True
117
+ return False
118
+
119
+
120
+ def _trace_error(current_trace: Trace) -> Optional[str]:
121
+ def _first_err(s: BaseSpan) -> Optional[str]:
122
+ if s.status == TraceSpanStatus.ERRORED and s.error:
123
+ return s.error
124
+ for c in s.children or []:
125
+ e = _first_err(c)
126
+ if e:
127
+ return e
128
+ return None
129
+
130
+ for root in current_trace.root_spans or []:
131
+ e = _first_err(root)
132
+ if e:
133
+ return e
134
+ return None
135
+
136
+
137
+ def _get_trace_by_uuid_anywhere(trace_uuid: str):
138
+ """
139
+ Resolver for a trace UUID across the manager's state.
140
+
141
+ First tries the manager's indexed lookup, which (covers active/in-flight traces,
142
+ then does a linear scan of the full `trace_manager.traces` list, which covers
143
+ traces that were recorded/closed earlier or not yet indexed. Returns
144
+ the concrete Trace object or None if not found.
145
+ """
146
+ tr = trace_manager.get_trace_by_uuid(trace_uuid)
147
+ if tr:
148
+ return tr
149
+ for tr in trace_manager.traces:
150
+ if tr.uuid == trace_uuid:
151
+ return tr
152
+ return None
153
+
154
+
155
+ def _pick_root_for_marking(trace):
156
+ """
157
+ Choose the most appropriate root span to annotate on error/cancel.
158
+
159
+ Heuristic:
160
+ - Prefer the most recent open root, which will have no `end_time` since this is the
161
+ span currently in flight.
162
+ - If none are open, use the last root span if it exists.
163
+ - If the trace has no roots, return None.
164
+
165
+ This favors marking the active root in multi root traces while remaining
166
+ stable for already closed traces.
167
+ """
168
+ open_roots = [rs for rs in trace.root_spans if rs.end_time is None]
169
+ return (
170
+ open_roots[-1]
171
+ if open_roots
172
+ else (trace.root_spans[-1] if trace.root_spans else None)
173
+ )
174
+
175
+
176
+ def _resolve_trace_and_root_for_task(t: asyncio.Task):
177
+ """
178
+ Resolve trace and root for a completed task using the weak binding map.
179
+
180
+ Steps:
181
+ 1. Look up the task in `trace_manager.task_bindings` to get the
182
+ bound `trace_uuid` and, if available, `root_span_uuid`.
183
+ 2. Resolve the Trace with `_get_trace_by_uuid_anywhere`.
184
+ 3. If a bound root UUID exists, try to find that exact root on the trace.
185
+ 4. Otherwise, fall back to `_pick_root_for_marking(trace)`.
186
+
187
+ Returns a trace / root tuple. Either may be `None` when no binding is
188
+ present. This function is used by `on_task_done` to robustly mark error/cancel
189
+ states without assuming a single root trace or a root that is still open.
190
+ """
191
+ binding = trace_manager.task_bindings.get(t) or {}
192
+ trace_uuid = binding.get("trace_uuid")
193
+ root_span_uuid = binding.get("root_span_uuid")
194
+
195
+ trace = _get_trace_by_uuid_anywhere(trace_uuid) if trace_uuid else None
196
+ root = None
197
+
198
+ if trace and root_span_uuid:
199
+ root = next(
200
+ (rs for rs in trace.root_spans if rs.uuid == root_span_uuid), None
201
+ )
202
+
203
+ if trace and root is None:
204
+ root = _pick_root_for_marking(trace)
205
+
206
+ return trace, root
207
+
208
+
100
209
  async def _snapshot_tasks():
101
210
  cur = asyncio.current_task()
102
211
  # `all_tasks` returns tasks for the current running loop only
@@ -115,6 +224,20 @@ def _gather_timeout() -> float:
      )
 
 
+ def filter_duplicate_results(
+     main_result: TestResult, results: List[TestResult]
+ ) -> List[TestResult]:
+     return [
+         result
+         for result in results
+         if not (
+             (result.input == main_result.input)
+             and (result.actual_output == main_result.actual_output)
+             and (result.metrics_data == main_result.metrics_data)
+         )
+     ]
+
+
  ###########################################
  ### E2E Evals #############################
  ###########################################
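For orientation, the new filter_duplicate_results helper above drops any trace-level result that matches the main test result on all three compared fields, so the same row is not reported twice. A minimal, self-contained sketch of the same logic; the FakeTestResult class below is an invented stand-in for deepeval's TestResult, reduced to the fields the helper compares:

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class FakeTestResult:
    # Stand-in for deepeval's TestResult; the real class has more fields.
    input: Optional[str] = None
    actual_output: Optional[str] = None
    metrics_data: Optional[list] = None


def filter_duplicate_results(main_result, results) -> List[FakeTestResult]:
    # Same comparison as the helper added in this release.
    return [
        r
        for r in results
        if not (
            r.input == main_result.input
            and r.actual_output == main_result.actual_output
            and r.metrics_data == main_result.metrics_data
        )
    ]


main = FakeTestResult("hi", "hello", [])
trace_results = [FakeTestResult("hi", "hello", []), FakeTestResult("tool", "ok", [])]
print(len(filter_duplicate_results(main, trace_results)))  # -> 1, the duplicate is dropped
```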
@@ -380,7 +503,10 @@ async def a_execute_test_cases(
 
      async def execute_with_semaphore(func: Callable, *args, **kwargs):
          async with semaphore:
-             return await func(*args, **kwargs)
+             return await asyncio.wait_for(
+                 func(*args, **kwargs),
+                 timeout=_per_task_timeout(),
+             )
 
      global_test_run_cache_manager.disable_write_cache = (
          cache_config.write_cache is False
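The change above wraps every semaphore-bounded evaluation in asyncio.wait_for, so each test case now has a hard per-task deadline in addition to the concurrency cap. A small illustration of that pattern in isolation; PER_TASK_TIMEOUT and slow_eval are invented here and merely stand in for deepeval's _per_task_timeout() setting and the wrapped callable:

```python
import asyncio

PER_TASK_TIMEOUT = 5.0  # stand-in for _per_task_timeout()


async def slow_eval(x: int) -> int:
    await asyncio.sleep(0.1)
    return x * 2


async def main() -> None:
    semaphore = asyncio.Semaphore(10)  # bounds how many evals run at once

    async def execute_with_semaphore(func, *args, **kwargs):
        # Concurrency cap plus a hard deadline per wrapped call.
        async with semaphore:
            return await asyncio.wait_for(
                func(*args, **kwargs), timeout=PER_TASK_TIMEOUT
            )

    results = await asyncio.gather(
        *(execute_with_semaphore(slow_eval, i) for i in range(5))
    )
    print(results)  # [0, 2, 4, 6, 8]


asyncio.run(main())
```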
@@ -499,7 +625,20 @@ async def a_execute_test_cases(
                      tasks.append(asyncio.create_task(task))
                  await asyncio.sleep(async_config.throttle_value)
 
-             await asyncio.gather(*tasks)
+
+             try:
+                 await asyncio.wait_for(
+                     asyncio.gather(*tasks),
+                     timeout=_gather_timeout(),
+                 )
+             except asyncio.TimeoutError:
+                 # Cancel any still-pending tasks and drain them
+                 for t in tasks:
+                     if not t.done():
+                         t.cancel()
+                 await asyncio.gather(*tasks, return_exceptions=True)
+                 raise
+
      else:
          for test_case in test_cases:
              with capture_evaluation_run("test case"):
@@ -572,7 +711,19 @@ async def a_execute_test_cases(
              tasks.append(asyncio.create_task(task))
 
              await asyncio.sleep(async_config.throttle_value)
-         await asyncio.gather(*tasks)
+
+         try:
+             await asyncio.wait_for(
+                 asyncio.gather(*tasks),
+                 timeout=_gather_timeout(),
+             )
+         except asyncio.TimeoutError:
+             # Cancel any still-pending tasks and drain them
+             for t in tasks:
+                 if not t.done():
+                     t.cancel()
+             await asyncio.gather(*tasks, return_exceptions=True)
+             raise
 
      return test_results
 
@@ -847,6 +998,7 @@ def execute_agentic_test_cases(
              _progress=progress,
              _pbar_callback_id=pbar_tags_id,
          ):
+
              if asyncio.iscoroutinefunction(observed_callback):
                  loop = get_or_create_event_loop()
                  coro = observed_callback(golden.input)
@@ -898,10 +1050,11 @@ def execute_agentic_test_cases(
              pbar_eval_id: Optional[int] = None,
          ):
              # Create API Span
-             metrics: List[BaseMetric] = span.metrics
+             metrics: List[BaseMetric] = list(span.metrics or [])
              api_span: BaseApiSpan = (
                  trace_manager._convert_span_to_api_span(span)
              )
+
              if isinstance(span, AgentSpan):
                  trace_api.agent_spans.append(api_span)
              elif isinstance(span, LlmSpan):
@@ -914,14 +1067,27 @@ def execute_agentic_test_cases(
              else:
                  trace_api.base_spans.append(api_span)
 
+             # Skip errored trace/span
+             if _skip_metrics_for_error(span=span, trace=current_trace):
+                 api_span.status = TraceSpanApiStatus.ERRORED
+                 api_span.error = span.error or _trace_error(
+                     current_trace
+                 )
+                 if progress and pbar_eval_id is not None:
+                     update_pbar(
+                         progress,
+                         pbar_eval_id,
+                         advance=count_metrics_in_span_subtree(span),
+                     )
+                 return
+
              for child in span.children:
                  dfs(child, progress, pbar_eval_id)
 
-             if span.metrics is None:
+             if not span.metrics:
                  return
-             has_task_completion = any(
-                 isinstance(metric, TaskCompletionMetric)
-                 for metric in span.metrics
+             requires_trace = any(
+                 metric.requires_trace for metric in span.metrics
              )
 
              llm_test_case = None
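Throughout these hunks the old isinstance(metric, TaskCompletionMetric) special case is replaced by a generic metric.requires_trace flag (base_metric.py gains one line in the file list above). A rough sketch of a custom metric opting in under that convention; only the requires_trace attribute is taken from this diff, and the rest assumes deepeval's usual BaseMetric interface, so treat the details as illustrative:

```python
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase


class TraceDepthMetric(BaseMetric):
    # From this diff: metrics that need the full trace set this flag instead
    # of the evaluator special-casing TaskCompletionMetric.
    requires_trace: bool = True

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    def measure(self, test_case: LLMTestCase) -> float:
        # When requires_trace is True, the evaluator attaches the nested span
        # dict to the test case (see the hunks above).
        trace_dict = getattr(test_case, "_trace_dict", None) or {}
        self.score = 1.0 if trace_dict else 0.0
        self.success = self.score >= self.threshold
        return self.score

    async def a_measure(self, test_case: LLMTestCase) -> float:
        return self.measure(test_case)

    def is_successful(self) -> bool:
        return self.success

    @property
    def __name__(self):
        return "Trace Depth (illustrative)"
```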
@@ -939,18 +1105,30 @@ def execute_agentic_test_cases(
                      tools_called=span.tools_called,
                      expected_tools=span.expected_tools,
                  )
-             if llm_test_case is None and not has_task_completion:
-                 raise ValueError(
-                     "Unable to run metrics on span without LLMTestCase. Are you sure you called `update_current_span()`?"
-                 )
 
              # add trace if task completion
-             if has_task_completion:
+             if requires_trace:
                  if llm_test_case is None:
                      llm_test_case = LLMTestCase(input="None")
                  llm_test_case._trace_dict = (
                      trace_manager.create_nested_spans_dict(span)
                  )
+             else:
+                 if llm_test_case is None:
+                     api_span.status = TraceSpanApiStatus.ERRORED
+                     api_span.error = format_error_text(
+                         DeepEvalError(
+                             "Span has metrics but no LLMTestCase. "
+                             "Are you sure you called `update_current_span()`?"
+                         )
+                     )
+                     if progress and pbar_eval_id is not None:
+                         update_pbar(
+                             progress,
+                             pbar_eval_id,
+                             advance=count_metrics_in_span_subtree(span),
+                         )
+                     return
 
              # Preparing metric calculation
              api_span.metrics_data = []
@@ -989,72 +1167,111 @@ def execute_agentic_test_cases(
 
          start_time = time.perf_counter()
 
+         skip_metrics_for_this_golden = False
          # Handle trace-level metrics
-         if current_trace.metrics:
-             has_task_completion = any(
-                 isinstance(metric, TaskCompletionMetric)
-                 for metric in current_trace.metrics
-             )
-
-             llm_test_case = None
-             if current_trace.input:
-                 llm_test_case = LLMTestCase(
-                     input=str(current_trace.input),
-                     actual_output=(
-                         str(current_trace.output)
-                         if current_trace.output is not None
-                         else None
+         if _skip_metrics_for_error(trace=current_trace):
+             trace_api.status = TraceSpanApiStatus.ERRORED
+             if progress and pbar_eval_id is not None:
+                 update_pbar(
+                     progress,
+                     pbar_eval_id,
+                     advance=count_total_metrics_for_trace(
+                         current_trace
                      ),
-                     expected_output=current_trace.expected_output,
-                     context=current_trace.context,
-                     retrieval_context=current_trace.retrieval_context,
-                     tools_called=current_trace.tools_called,
-                     expected_tools=current_trace.expected_tools,
                  )
-             if llm_test_case is None and not has_task_completion:
-                 raise ValueError(
-                     "Unable to run metrics on trace without LLMTestCase. Are you sure you called `update_current_trace()`?"
+         else:
+             if current_trace.metrics:
+                 requires_trace = any(
+                     metric.requires_trace
+                     for metric in current_trace.metrics
                  )
 
-             if has_task_completion:
-                 if llm_test_case is None:
-                     llm_test_case = LLMTestCase(input="None")
-                 llm_test_case._trace_dict = (
-                     trace_manager.create_nested_spans_dict(
-                         current_trace.root_spans[0]
+                 llm_test_case = None
+                 if current_trace.input:
+                     llm_test_case = LLMTestCase(
+                         input=str(current_trace.input),
+                         actual_output=(
+                             str(current_trace.output)
+                             if current_trace.output is not None
+                             else None
+                         ),
+                         expected_output=current_trace.expected_output,
+                         context=current_trace.context,
+                         retrieval_context=current_trace.retrieval_context,
+                         tools_called=current_trace.tools_called,
+                         expected_tools=current_trace.expected_tools,
                      )
-                 )
-
-             for metric in current_trace.metrics:
-                 metric.skipped = False
-                 metric.error = None
-                 if display_config.verbose_mode is not None:
-                     metric.verbose_mode = display_config.verbose_mode
-
-             trace_api.metrics_data = []
-             for metric in current_trace.metrics:
-                 res = _execute_metric(
-                     metric=metric,
-                     test_case=llm_test_case,
-                     show_metric_indicator=show_metric_indicator,
-                     in_component=True,
-                     error_config=error_config,
-                 )
-                 if res == "skip":
-                     continue
-
-                 if not metric.skipped:
-                     metric_data = create_metric_data(metric)
-                     trace_api.metrics_data.append(metric_data)
-                     api_test_case.update_metric_data(metric_data)
-                     api_test_case.update_status(metric_data.success)
-                     update_pbar(progress, pbar_eval_id)
+                 if requires_trace:
+                     if llm_test_case is None:
+                         llm_test_case = LLMTestCase(input="None")
+                     llm_test_case._trace_dict = (
+                         trace_manager.create_nested_spans_dict(
+                             current_trace.root_spans[0]
+                         )
+                     )
+                 else:
+                     if llm_test_case is None:
+                         current_trace.status = TraceSpanStatus.ERRORED
+                         trace_api.status = TraceSpanApiStatus.ERRORED
+                         if current_trace.root_spans:
+                             current_trace.root_spans[0].status = (
+                                 TraceSpanStatus.ERRORED
+                             )
+                             current_trace.root_spans[0].error = (
+                                 format_error_text(
+                                     DeepEvalError(
+                                         "Trace has metrics but no LLMTestCase (missing input/output). "
+                                         "Are you sure you called `update_current_trace()`?"
+                                     )
+                                 )
+                             )
+                         if progress and pbar_eval_id is not None:
+                             update_pbar(
+                                 progress,
+                                 pbar_eval_id,
+                                 advance=count_total_metrics_for_trace(
+                                     current_trace
+                                 ),
+                             )
+                         skip_metrics_for_this_golden = True
+
+                 if not skip_metrics_for_this_golden:
+                     for metric in current_trace.metrics:
+                         metric.skipped = False
+                         metric.error = None
+                         if display_config.verbose_mode is not None:
+                             metric.verbose_mode = (
+                                 display_config.verbose_mode
+                             )
+
+                     trace_api.metrics_data = []
+                     for metric in current_trace.metrics:
+                         res = _execute_metric(
+                             metric=metric,
+                             test_case=llm_test_case,
+                             show_metric_indicator=show_metric_indicator,
+                             in_component=True,
+                             error_config=error_config,
+                         )
+                         if res == "skip":
+                             continue
+
+                         if not metric.skipped:
+                             metric_data = create_metric_data(metric)
+                             trace_api.metrics_data.append(metric_data)
+                             api_test_case.update_metric_data(
+                                 metric_data
+                             )
+                             api_test_case.update_status(
+                                 metric_data.success
+                             )
+                             update_pbar(progress, pbar_eval_id)
+
+             # Then handle span-level metrics
+             dfs(current_trace.root_spans[0], progress, pbar_eval_id)
 
-         # Then handle span-level metrics
-         dfs(current_trace.root_spans[0], progress, pbar_eval_id)
          end_time = time.perf_counter()
          run_duration = end_time - start_time
-
          # Update test run
          api_test_case.update_run_duration(run_duration)
          test_run_manager.update_test_run(api_test_case, test_case)
@@ -1102,7 +1319,10 @@ async def a_execute_agentic_test_cases(
 
      async def execute_with_semaphore(func: Callable, *args, **kwargs):
          async with semaphore:
-             return await func(*args, **kwargs)
+             return await asyncio.wait_for(
+                 func(*args, **kwargs),
+                 timeout=_per_task_timeout(),
+             )
 
      test_run_manager = global_test_run_manager
@@ -1149,7 +1369,19 @@ async def a_execute_agentic_test_cases(
                  tasks.append(asyncio.create_task(task))
                  await asyncio.sleep(async_config.throttle_value)
 
-             await asyncio.gather(*tasks)
+             try:
+                 await asyncio.wait_for(
+                     asyncio.gather(*tasks),
+                     timeout=_gather_timeout(),
+                 )
+             except asyncio.TimeoutError:
+                 # Cancel any still-pending tasks and drain them
+                 for t in tasks:
+                     if not t.done():
+                         t.cancel()
+                 await asyncio.gather(*tasks, return_exceptions=True)
+                 raise
+
      else:
          for golden in goldens:
              with capture_evaluation_run("golden"):
@@ -1266,7 +1498,7 @@ async def _a_execute_agentic_test_case(
      )
 
      await _a_execute_trace_test_case(
-         trace=trace,
+         trace=current_trace,
          trace_api=trace_api,
          api_test_case=api_test_case,
          ignore_errors=ignore_errors,
@@ -1278,9 +1510,10 @@ async def _a_execute_agentic_test_case(
          _use_bar_indicator=_use_bar_indicator,
      )
 
-     async def dfs(span: BaseSpan):
+     async def dfs(trace: Trace, span: BaseSpan):
          await _a_execute_span_test_case(
              span=span,
+             current_trace=trace,
              trace_api=trace_api,
              api_test_case=api_test_case,
              ignore_errors=ignore_errors,
@@ -1292,36 +1525,58 @@ async def _a_execute_agentic_test_case(
              test_run_manager=test_run_manager,
              _use_bar_indicator=_use_bar_indicator,
          )
-         child_tasks = [dfs(child) for child in span.children]
+
+         if _skip_metrics_for_error(span=span, trace=trace):
+             return
+
+         child_tasks = [
+             asyncio.create_task(dfs(trace, child)) for child in span.children
+         ]
          if child_tasks:
-             await asyncio.gather(*child_tasks)
+             try:
+                 await asyncio.wait_for(
+                     asyncio.gather(*child_tasks),
+                     timeout=_gather_timeout(),
+                 )
+             except asyncio.TimeoutError:
+                 for t in child_tasks:
+                     if not t.done():
+                         t.cancel()
+                 await asyncio.gather(*child_tasks, return_exceptions=True)
+                 raise
 
      test_start_time = time.perf_counter()
-     if current_trace and current_trace.root_spans:
-         await dfs(current_trace.root_spans[0])
-     else:
-         if (
-             logger.isEnabledFor(logging.DEBUG)
-             and get_settings().DEEPEVAL_VERBOSE_MODE
-         ):
-             logger.debug(
-                 "Skipping DFS: empty trace or no root spans (trace=%s)",
-                 current_trace.uuid if current_trace else None,
-             )
+
+     if not _skip_metrics_for_error(trace=current_trace):
+         if current_trace and current_trace.root_spans:
+             await dfs(current_trace, current_trace.root_spans[0])
+         else:
+             if (
+                 logger.isEnabledFor(logging.DEBUG)
+                 and get_settings().DEEPEVAL_VERBOSE_MODE
+             ):
+                 logger.debug(
+                     "Skipping DFS: empty trace or no root spans (trace=%s)",
+                     current_trace.uuid if current_trace else None,
+                 )
 
      test_end_time = time.perf_counter()
      run_duration = test_end_time - test_start_time
 
      api_test_case.update_run_duration(run_duration)
      test_run_manager.update_test_run(api_test_case, test_case)
-     test_results.append(create_test_result(api_test_case))
-     test_results.extend(extract_trace_test_results(trace_api))
+     main_result = create_test_result(api_test_case)
+     trace_results = extract_trace_test_results(trace_api)
+     unique_trace_results = filter_duplicate_results(main_result, trace_results)
+     test_results.append(main_result)
+     test_results.extend(unique_trace_results)
 
      update_pbar(progress, pbar_id)
 
 
  async def _a_execute_span_test_case(
      span: BaseSpan,
+     current_trace: Trace,
      trace_api: TraceApi,
      api_test_case: LLMApiTestCase,
      ignore_errors: bool,
@@ -1346,12 +1601,22 @@ async def _a_execute_span_test_case(
      else:
          trace_api.base_spans.append(api_span)
 
-     if span.metrics is None:
+     if _skip_metrics_for_error(span=span, trace=current_trace):
+         api_span.status = TraceSpanApiStatus.ERRORED
+         api_span.error = span.error or _trace_error(current_trace)
+         if progress and pbar_eval_id is not None:
+             update_pbar(
+                 progress,
+                 pbar_eval_id,
+                 advance=count_metrics_in_span_subtree(span),
+             )
          return
 
-     has_task_completion = any(
-         isinstance(metric, TaskCompletionMetric) for metric in span.metrics
-     )
+     metrics: List[BaseMetric] = list(span.metrics or [])
+     if not metrics:
+         return
+
+     requires_trace = any(metric.requires_trace for metric in metrics)
 
      llm_test_case = None
      if span.input:
@@ -1364,17 +1629,29 @@ async def _a_execute_span_test_case(
              tools_called=span.tools_called,
              expected_tools=span.expected_tools,
          )
-     if llm_test_case is None and not has_task_completion:
-         raise ValueError(
-             "Unable to run metrics on span without LLMTestCase. Are you sure you called `update_current_span()`?"
-         )
+
+     if not requires_trace:
+         if llm_test_case is None:
+             api_span.status = TraceSpanApiStatus.ERRORED
+             api_span.error = format_error_text(
+                 DeepEvalError(
+                     "Span has metrics but no LLMTestCase. "
+                     "Are you sure you called `update_current_span()`?"
+                 )
+             )
+             if progress and pbar_eval_id is not None:
+                 update_pbar(
+                     progress,
+                     pbar_eval_id,
+                     advance=count_metrics_in_span_subtree(span),
+                 )
+             return
 
      show_metrics_indicator = show_indicator and not _use_bar_indicator
-     metrics: List[BaseMetric] = span.metrics
      test_case: Optional[LLMTestCase] = llm_test_case
 
      # add trace if task completion
-     if has_task_completion:
+     if requires_trace:
          if test_case is None:
              test_case = LLMTestCase(input="None")
          test_case._trace_dict = trace_manager.create_nested_spans_dict(span)
1418
1695
  pbar_eval_id: Optional[int],
1419
1696
  _use_bar_indicator: bool,
1420
1697
  ):
1421
- if trace.metrics is None:
1698
+
1699
+ if _skip_metrics_for_error(trace=trace):
1700
+ trace_api.status = TraceSpanApiStatus.ERRORED
1701
+ if progress and pbar_eval_id is not None:
1702
+ update_pbar(
1703
+ progress,
1704
+ pbar_eval_id,
1705
+ advance=count_total_metrics_for_trace(trace),
1706
+ )
1422
1707
  return
1423
1708
 
1424
- has_task_completion = any(
1425
- isinstance(metric, TaskCompletionMetric) for metric in trace.metrics
1426
- )
1709
+ metrics: List[BaseMetric] = list(trace.metrics or [])
1710
+ if not metrics:
1711
+ return
1712
+
1713
+ requires_trace = any(metric.requires_trace for metric in metrics)
1427
1714
 
1428
1715
  llm_test_case = None
1429
1716
  if trace.input:
@@ -1438,17 +1725,32 @@ async def _a_execute_trace_test_case(
1438
1725
  tools_called=trace.tools_called,
1439
1726
  expected_tools=trace.expected_tools,
1440
1727
  )
1441
- if llm_test_case is None and not has_task_completion:
1442
- raise ValueError(
1443
- "Unable to run metrics on trace without LLMTestCase. Are you sure you called `update_current_trace()`?"
1444
- )
1728
+
1729
+ if not requires_trace:
1730
+ if llm_test_case is None:
1731
+ trace.status = TraceSpanStatus.ERRORED
1732
+ trace_api.status = TraceSpanApiStatus.ERRORED
1733
+ if trace.root_spans:
1734
+ trace.root_spans[0].status = TraceSpanStatus.ERRORED
1735
+ trace.root_spans[0].error = format_error_text(
1736
+ DeepEvalError(
1737
+ "Trace has metrics but no LLMTestCase (missing input/output). "
1738
+ "Are you sure you called `update_current_trace()`?"
1739
+ )
1740
+ )
1741
+ if progress and pbar_eval_id is not None:
1742
+ update_pbar(
1743
+ progress,
1744
+ pbar_eval_id,
1745
+ advance=count_total_metrics_for_trace(trace),
1746
+ )
1747
+ return
1445
1748
 
1446
1749
  show_metrics_indicator = show_indicator and not _use_bar_indicator
1447
- metrics: List[BaseMetric] = trace.metrics
1448
1750
  test_case: Optional[LLMTestCase] = llm_test_case
1449
1751
 
1450
1752
  # add trace if task completion
1451
- if has_task_completion:
1753
+ if requires_trace:
1452
1754
  if test_case is None:
1453
1755
  test_case = LLMTestCase(input="None")
1454
1756
  test_case._trace_dict = trace_manager.create_nested_spans_dict(
@@ -1578,11 +1880,12 @@ def execute_agentic_test_cases_from_loop(
1578
1880
  pbar_eval_id: Optional[int] = None,
1579
1881
  ):
1580
1882
  # Create API Span
1581
- metrics: List[BaseMetric] = span.metrics
1883
+ metrics: List[BaseMetric] = list(span.metrics or [])
1582
1884
 
1583
1885
  api_span: BaseApiSpan = (
1584
1886
  trace_manager._convert_span_to_api_span(span)
1585
1887
  )
1888
+
1586
1889
  if isinstance(span, AgentSpan):
1587
1890
  trace_api.agent_spans.append(api_span)
1588
1891
  elif isinstance(span, LlmSpan):
@@ -1595,9 +1898,30 @@ def execute_agentic_test_cases_from_loop(
1595
1898
  else:
1596
1899
  trace_api.base_spans.append(api_span)
1597
1900
 
1901
+ # Skip errored trace/span
1902
+ if _skip_metrics_for_error(span=span, trace=current_trace):
1903
+ api_span.status = TraceSpanApiStatus.ERRORED
1904
+ api_span.error = span.error or _trace_error(
1905
+ current_trace
1906
+ )
1907
+ if progress and pbar_eval_id is not None:
1908
+ update_pbar(
1909
+ progress,
1910
+ pbar_eval_id,
1911
+ advance=count_metrics_in_span_subtree(span),
1912
+ )
1913
+ return
1914
+
1598
1915
  for child in span.children:
1599
1916
  dfs(child, progress, pbar_eval_id)
1600
1917
 
1918
+ if not span.metrics:
1919
+ return
1920
+
1921
+ requires_trace = any(
1922
+ metric.requires_trace for metric in metrics
1923
+ )
1924
+
1601
1925
  llm_test_case = None
1602
1926
  if span.input is not None:
1603
1927
  llm_test_case = LLMTestCase(
@@ -1613,20 +1937,29 @@ def execute_agentic_test_cases_from_loop(
1613
1937
  tools_called=span.tools_called,
1614
1938
  expected_tools=span.expected_tools,
1615
1939
  )
1616
- if span.metrics is None or llm_test_case is None:
1617
- return
1618
-
1619
- has_task_completion = any(
1620
- isinstance(metric, TaskCompletionMetric)
1621
- for metric in metrics
1622
- )
1623
1940
 
1624
- if has_task_completion:
1941
+ if requires_trace:
1625
1942
  if llm_test_case is None:
1626
1943
  llm_test_case = LLMTestCase(input="None")
1627
1944
  llm_test_case._trace_dict = (
1628
1945
  trace_manager.create_nested_spans_dict(span)
1629
1946
  )
1947
+ else:
1948
+ if llm_test_case is None:
1949
+ api_span.status = TraceSpanApiStatus.ERRORED
1950
+ api_span.error = format_error_text(
1951
+ DeepEvalError(
1952
+ "Span has metrics but no LLMTestCase. "
1953
+ "Are you sure you called `update_current_span()`?"
1954
+ )
1955
+ )
1956
+ if progress and pbar_eval_id is not None:
1957
+ update_pbar(
1958
+ progress,
1959
+ pbar_eval_id,
1960
+ advance=count_metrics_in_span_subtree(span),
1961
+ )
1962
+ return
1630
1963
 
1631
1964
  # Preparing metric calculation
1632
1965
  api_span.metrics_data = []
@@ -1670,77 +2003,123 @@ def execute_agentic_test_cases_from_loop(
1670
2003
  start_time = time.perf_counter()
1671
2004
 
1672
2005
  # Handle trace-level metrics
1673
- if current_trace.metrics:
1674
- has_task_completion = any(
1675
- isinstance(metric, TaskCompletionMetric)
1676
- for metric in current_trace.metrics
1677
- )
1678
-
1679
- llm_test_case = None
1680
- if current_trace.input:
1681
- llm_test_case = LLMTestCase(
1682
- input=str(current_trace.input),
1683
- actual_output=(
1684
- str(current_trace.output)
1685
- if current_trace.output is not None
1686
- else None
2006
+ skip_metrics_for_this_golden = False
2007
+ if _skip_metrics_for_error(trace=current_trace):
2008
+ trace_api.status = TraceSpanApiStatus.ERRORED
2009
+ if progress and pbar_eval_id is not None:
2010
+ update_pbar(
2011
+ progress,
2012
+ pbar_eval_id,
2013
+ advance=count_total_metrics_for_trace(
2014
+ current_trace
1687
2015
  ),
1688
- expected_output=current_trace.expected_output,
1689
- context=current_trace.context,
1690
- retrieval_context=current_trace.retrieval_context,
1691
- tools_called=current_trace.tools_called,
1692
- expected_tools=current_trace.expected_tools,
1693
2016
  )
1694
- if llm_test_case is None and not has_task_completion:
1695
- raise ValueError(
1696
- "Unable to run metrics on trace without LLMTestCase. Are you sure you called `update_current_trace()`?"
2017
+ else:
2018
+ if current_trace.metrics:
2019
+ requires_trace = any(
2020
+ metric.requires_trace
2021
+ for metric in current_trace.metrics
1697
2022
  )
1698
2023
 
1699
- if has_task_completion:
1700
- if llm_test_case is None:
1701
- llm_test_case = LLMTestCase(input="None")
1702
- llm_test_case._trace_dict = (
1703
- trace_manager.create_nested_spans_dict(
1704
- current_trace.root_spans[0]
2024
+ llm_test_case = None
2025
+ if current_trace.input:
2026
+ llm_test_case = LLMTestCase(
2027
+ input=str(current_trace.input),
2028
+ actual_output=(
2029
+ str(current_trace.output)
2030
+ if current_trace.output is not None
2031
+ else None
2032
+ ),
2033
+ expected_output=current_trace.expected_output,
2034
+ context=current_trace.context,
2035
+ retrieval_context=current_trace.retrieval_context,
2036
+ tools_called=current_trace.tools_called,
2037
+ expected_tools=current_trace.expected_tools,
1705
2038
  )
1706
- )
1707
-
1708
- for metric in current_trace.metrics:
1709
- metric.skipped = False
1710
- metric.error = None
1711
- if display_config.verbose_mode is not None:
1712
- metric.verbose_mode = display_config.verbose_mode
1713
-
1714
- trace_api.metrics_data = []
1715
- for metric in current_trace.metrics:
1716
- res = _execute_metric(
1717
- metric=metric,
1718
- test_case=llm_test_case,
1719
- show_metric_indicator=show_metric_indicator,
1720
- in_component=True,
1721
- error_config=error_config,
1722
- )
1723
- if res == "skip":
1724
- continue
1725
-
1726
- if not metric.skipped:
1727
- metric_data = create_metric_data(metric)
1728
- trace_api.metrics_data.append(metric_data)
1729
- api_test_case.update_metric_data(metric_data)
1730
- api_test_case.update_status(metric_data.success)
1731
- update_pbar(progress, pbar_eval_id)
1732
-
1733
- # Then handle span-level metrics
1734
- dfs(current_trace.root_spans[0], progress, pbar_eval_id)
1735
- end_time = time.perf_counter()
1736
- run_duration = end_time - start_time
1737
2039
 
1738
- # Update test run
1739
- api_test_case.update_run_duration(run_duration)
1740
- test_run_manager.update_test_run(api_test_case, test_case)
1741
- test_results.append(create_test_result(api_test_case))
2040
+ if requires_trace:
2041
+ if llm_test_case is None:
2042
+ llm_test_case = LLMTestCase(input="None")
2043
+ llm_test_case._trace_dict = (
2044
+ trace_manager.create_nested_spans_dict(
2045
+ current_trace.root_spans[0]
2046
+ )
2047
+ )
2048
+ else:
2049
+ if llm_test_case is None:
2050
+ current_trace.status = TraceSpanStatus.ERRORED
2051
+ trace_api.status = TraceSpanApiStatus.ERRORED
2052
+ if current_trace.root_spans:
2053
+ current_trace.root_spans[0].status = (
2054
+ TraceSpanStatus.ERRORED
2055
+ )
2056
+ current_trace.root_spans[0].error = (
2057
+ format_error_text(
2058
+ DeepEvalError(
2059
+ "Trace has metrics but no LLMTestCase (missing input/output). "
2060
+ "Are you sure you called `update_current_trace()`?"
2061
+ )
2062
+ )
2063
+ )
2064
+ if progress and pbar_eval_id is not None:
2065
+ update_pbar(
2066
+ progress,
2067
+ pbar_eval_id,
2068
+ advance=count_total_metrics_for_trace(
2069
+ current_trace
2070
+ ),
2071
+ )
2072
+ skip_metrics_for_this_golden = True
2073
+
2074
+ if not skip_metrics_for_this_golden:
2075
+ for metric in current_trace.metrics:
2076
+ metric.skipped = False
2077
+ metric.error = None
2078
+ if display_config.verbose_mode is not None:
2079
+ metric.verbose_mode = (
2080
+ display_config.verbose_mode
2081
+ )
2082
+
2083
+ trace_api.metrics_data = []
2084
+ for metric in current_trace.metrics:
2085
+ res = _execute_metric(
2086
+ metric=metric,
2087
+ test_case=llm_test_case,
2088
+ show_metric_indicator=show_metric_indicator,
2089
+ in_component=True,
2090
+ error_config=error_config,
2091
+ )
2092
+ if res == "skip":
2093
+ continue
2094
+
2095
+ if not metric.skipped:
2096
+ metric_data = create_metric_data(metric)
2097
+ trace_api.metrics_data.append(metric_data)
2098
+ api_test_case.update_metric_data(
2099
+ metric_data
2100
+ )
2101
+ api_test_case.update_status(
2102
+ metric_data.success
2103
+ )
2104
+ update_pbar(progress, pbar_eval_id)
2105
+
2106
+ # Then handle span-level metrics
2107
+ dfs(current_trace.root_spans[0], progress, pbar_eval_id)
2108
+
2109
+ end_time = time.perf_counter()
2110
+ run_duration = end_time - start_time
2111
+ # Update test run
2112
+ api_test_case.update_run_duration(run_duration)
2113
+ test_run_manager.update_test_run(api_test_case, test_case)
2114
+ main_result = create_test_result(api_test_case)
2115
+ trace_results = extract_trace_test_results(trace_api)
2116
+ unique_trace_results = filter_duplicate_results(
2117
+ main_result, trace_results
2118
+ )
2119
+ test_results.append(main_result)
2120
+ test_results.extend(unique_trace_results)
1742
2121
 
1743
- update_pbar(progress, pbar_id)
2122
+ update_pbar(progress, pbar_id)
1744
2123
 
1745
2124
  try:
1746
2125
  if display_config.show_indicator and _use_bar_indicator:
@@ -1841,39 +2220,137 @@ def a_execute_agentic_test_cases_from_loop(
1841
2220
  }
1842
2221
 
1843
2222
  def on_task_done(t: asyncio.Task):
2223
+ cancelled = False
2224
+ exc = None
2225
+ trace = None
2226
+ root = None
2227
+ resolved_trace_from_task = False
2228
+ resolved_root_from_task = False
2229
+
2230
+ # Task.exception() raises CancelledError if task was cancelled
2231
+ try:
2232
+ exc = t.exception()
2233
+ except asyncio.CancelledError:
2234
+ cancelled = True
2235
+ exc = None
2236
+
2237
+ meta = task_meta.get(t, {})
2238
+ golden_index = meta.get("golden_index")
2239
+
2240
+ if golden_index is not None and 0 <= golden_index < len(
2241
+ goldens
2242
+ ):
2243
+ golden = goldens[golden_index]
2244
+
2245
+ def _mark_trace_error(trace, root, msg: str):
2246
+ now = time.perf_counter()
2247
+ trace.status = TraceSpanStatus.ERRORED
2248
+ # Close the trace so the API layer has a proper endTime
2249
+ if trace.end_time is None:
2250
+ trace.end_time = now
2251
+ if root:
2252
+ root.status = TraceSpanStatus.ERRORED
2253
+ root.error = msg
2254
+ if root.end_time is None:
2255
+ root.end_time = now
2256
+
2257
+ if exc is not None:
2258
+ msg = format_error_text(exc)
2259
+ trace, root = _resolve_trace_and_root_for_task(t)
2260
+ resolved_trace_from_task = bool(trace)
2261
+ resolved_root_from_task = bool(root)
2262
+ if trace:
2263
+ _mark_trace_error(trace, root, msg)
2264
+ else:
2265
+ for (
2266
+ trace
2267
+ ) in trace_manager.integration_traces_to_evaluate:
2268
+ if (
2269
+ trace_manager.trace_uuid_to_golden.get(
2270
+ trace.uuid
2271
+ )
2272
+ is golden
2273
+ ):
2274
+ root = _pick_root_for_marking(trace)
2275
+ _mark_trace_error(trace, root, msg)
2276
+ break
2277
+
2278
+ elif cancelled or t.cancelled():
2279
+ cancel_exc = DeepEvalError(
2280
+ "Task was cancelled (likely due to timeout)."
2281
+ )
2282
+ msg = format_error_text(cancel_exc)
2283
+ trace, root = _resolve_trace_and_root_for_task(t)
2284
+ resolved_trace_from_task = bool(trace)
2285
+ resolved_root_from_task = bool(root)
2286
+ if trace:
2287
+ _mark_trace_error(trace, root, msg)
2288
+ else:
2289
+ for (
2290
+ trace
2291
+ ) in trace_manager.integration_traces_to_evaluate:
2292
+ if (
2293
+ trace_manager.trace_uuid_to_golden.get(
2294
+ trace.uuid
2295
+ )
2296
+ is golden
2297
+ ):
2298
+ root = _pick_root_for_marking(trace)
2299
+ _mark_trace_error(trace, root, msg)
2300
+ break
2301
+
1844
2302
  if get_settings().DEEPEVAL_DEBUG_ASYNC:
1845
2303
  # Using info level here to make it easy to spot these logs.
1846
- # We are gated by DEEPEVAL_DEBUG_ASYNC
1847
- meta = task_meta.get(t, {})
2304
+ golden_name = meta.get("golden_name")
1848
2305
  duration = time.perf_counter() - meta.get(
1849
2306
  "started", started
1850
2307
  )
1851
2308
 
1852
- if t.cancelled():
2309
+ if cancelled or exc is not None:
2310
+ if not resolved_trace_from_task:
2311
+ logger.warning(
2312
+ "[deepeval] on_task_done: no binding for task; falling back to golden->trace. task=%s golden=%r",
2313
+ t.get_name(),
2314
+ golden_name,
2315
+ )
2316
+ elif not resolved_root_from_task:
2317
+ logger.warning(
2318
+ "[deepeval] on_task_done: bound trace found but no bound root; using heuristic. task=%s trace=%s",
2319
+ t.get_name(),
2320
+ trace.uuid,
2321
+ )
2322
+
2323
+ if cancelled:
1853
2324
  logger.info(
1854
2325
  "[deepeval] task CANCELLED %s after %.2fs meta=%r",
1855
2326
  t.get_name(),
1856
2327
  duration,
1857
2328
  meta,
1858
2329
  )
2330
+ elif exc is not None:
2331
+ logger.error(
2332
+ "[deepeval] task ERROR %s after %.2fs meta=%r",
2333
+ t.get_name(),
2334
+ duration,
2335
+ meta,
2336
+ exc_info=(
2337
+ type(exc),
2338
+ exc,
2339
+ getattr(exc, "__traceback__", None),
2340
+ ),
2341
+ )
1859
2342
  else:
1860
- exc = t.exception()
1861
- if exc is not None:
1862
- logger.error(
1863
- "[deepeval] task ERROR %s after %.2fs meta=%r",
1864
- t.get_name(),
1865
- duration,
1866
- meta,
1867
- exc_info=(type(exc), exc, exc.__traceback__),
1868
- )
1869
- else:
1870
- logger.info(
1871
- "[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
1872
- t.get_name(),
1873
- duration,
1874
- meta.get("golden_index"),
1875
- )
2343
+ logger.info(
2344
+ "[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
2345
+ t.get_name(),
2346
+ duration,
2347
+ meta.get("golden_index"),
2348
+ )
1876
2349
 
2350
+ try:
2351
+ trace_manager.task_bindings.pop(t, None)
2352
+ except Exception:
2353
+ pass
1877
2354
  update_pbar(progress, pbar_callback_id)
1878
2355
  update_pbar(progress, pbar_id)
1879
2356
 
@@ -1918,6 +2395,7 @@ def a_execute_agentic_test_cases_from_loop(
                      timeout=_gather_timeout(),
                  )
              )
+
          except asyncio.TimeoutError:
              import traceback
 
@@ -1987,10 +2465,11 @@ def a_execute_agentic_test_cases_from_loop(
          ]
 
          if get_settings().DEEPEVAL_DEBUG_ASYNC:
-             logger.warning(
-                 "[deepeval] %d stray task(s) not tracked; cancelling...",
-                 len(leftovers),
-             )
+             if len(leftovers) > 0:
+                 logger.warning(
+                     "[deepeval] %d stray task(s) not tracked; cancelling...",
+                     len(leftovers),
+                 )
          for t in leftovers:
              meta = task_meta.get(t, {})
              name = t.get_name()
@@ -2130,7 +2609,10 @@ async def _a_evaluate_traces(
 
      async def execute_evals_with_semaphore(func: Callable, *args, **kwargs):
          async with semaphore:
-             return await func(*args, **kwargs)
+             return await asyncio.wait_for(
+                 func(*args, **kwargs),
+                 timeout=_per_task_timeout(),
+             )
 
      eval_tasks = []
      # Here, we will work off a fixed-set copy to avoid surprises from potential
@@ -2173,7 +2655,18 @@ async def _a_evaluate_traces(
          )
          eval_tasks.append(asyncio.create_task(task))
          await asyncio.sleep(throttle_value)
-     await asyncio.gather(*eval_tasks)
+
+     try:
+         await asyncio.wait_for(
+             asyncio.gather(*eval_tasks),
+             timeout=_gather_timeout(),
+         )
+     except asyncio.TimeoutError:
+         for t in eval_tasks:
+             if not t.done():
+                 t.cancel()
+         await asyncio.gather(*eval_tasks, return_exceptions=True)
+         raise
 
 
  async def _evaluate_test_case_pairs(
@@ -2196,7 +2689,10 @@ async def _evaluate_test_case_pairs(
 
      async def execute_with_semaphore(func: Callable, *args, **kwargs):
          async with semaphore:
-             return await func(*args, **kwargs)
+             return await asyncio.wait_for(
+                 func(*args, **kwargs),
+                 timeout=_per_task_timeout(),
+             )
 
      tasks = []
      for count, test_case_pair in enumerate(test_case_pairs):
@@ -2229,7 +2725,19 @@ async def _evaluate_test_case_pairs(
          )
          tasks.append(asyncio.create_task(task))
          await asyncio.sleep(throttle_value)
-     await asyncio.gather(*tasks)
+
+     try:
+         await asyncio.wait_for(
+             asyncio.gather(*tasks),
+             timeout=_gather_timeout(),
+         )
+     except asyncio.TimeoutError:
+         # Cancel any still-pending tasks and drain them
+         for t in tasks:
+             if not t.done():
+                 t.cancel()
+         await asyncio.gather(*tasks, return_exceptions=True)
+         raise
 
 
  def _execute_metric(
@@ -2251,7 +2759,7 @@ def _execute_metric(
                  return "skip"
              else:
                  if error_config.ignore_errors:
-                     metric.error = str(e)
+                     metric.error = format_error_text(e)
                      metric.success = False
                  else:
                      raise
@@ -2263,19 +2771,19 @@ def _execute_metric(
                  return "skip"
              else:
                  if error_config.ignore_errors:
-                     metric.error = str(e)
+                     metric.error = format_error_text(e)
                      metric.success = False
                  else:
                      raise
          except Exception as e:
              if error_config.ignore_errors:
-                 metric.error = str(e)
+                 metric.error = format_error_text(e)
                  metric.success = False
              else:
                  raise
      except Exception as e:
          if error_config.ignore_errors:
-             metric.error = str(e)
+             metric.error = format_error_text(e)
              metric.success = False
          else:
              raise
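These last hunks swap str(e) for format_error_text(e) when recording metric errors. The helper itself is imported from deepeval.utils (which also changed in this release) and its implementation is not shown in this diff; a hypothetical equivalent, purely to illustrate why a formatter beats bare str(e), might look like the sketch below. Nothing here is the actual deepeval implementation:

```python
import traceback


def format_error_text(exc: BaseException) -> str:
    # Hypothetical stand-in: include the exception type with the message,
    # since str(e) alone can be empty (e.g. str(ValueError()) == "").
    text = f"{type(exc).__name__}: {exc}"
    if exc.__traceback__ is not None:
        tb = "".join(
            traceback.format_exception(type(exc), exc, exc.__traceback__)
        )
        return f"{text}\n{tb}"
    return text


print(format_error_text(ValueError("bad input")))
```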