deepeval 3.6.7__py3-none-any.whl → 3.6.9__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (90)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +104 -36
  3. deepeval/config/utils.py +5 -0
  4. deepeval/dataset/dataset.py +162 -30
  5. deepeval/dataset/utils.py +41 -13
  6. deepeval/errors.py +20 -2
  7. deepeval/evaluate/execute.py +1662 -688
  8. deepeval/evaluate/types.py +1 -0
  9. deepeval/evaluate/utils.py +13 -3
  10. deepeval/integrations/crewai/__init__.py +2 -1
  11. deepeval/integrations/crewai/tool.py +71 -0
  12. deepeval/integrations/llama_index/__init__.py +0 -4
  13. deepeval/integrations/llama_index/handler.py +20 -21
  14. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  15. deepeval/metrics/__init__.py +13 -0
  16. deepeval/metrics/base_metric.py +1 -0
  17. deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
  18. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  19. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
  20. deepeval/metrics/dag/schema.py +1 -1
  21. deepeval/metrics/dag/templates.py +2 -2
  22. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  23. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  24. deepeval/metrics/goal_accuracy/schema.py +17 -0
  25. deepeval/metrics/goal_accuracy/template.py +235 -0
  26. deepeval/metrics/hallucination/hallucination.py +8 -8
  27. deepeval/metrics/indicator.py +21 -1
  28. deepeval/metrics/mcp/mcp_task_completion.py +7 -2
  29. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
  30. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
  31. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
  32. deepeval/metrics/plan_adherence/__init__.py +1 -0
  33. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  34. deepeval/metrics/plan_adherence/schema.py +11 -0
  35. deepeval/metrics/plan_adherence/template.py +170 -0
  36. deepeval/metrics/plan_quality/__init__.py +1 -0
  37. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  38. deepeval/metrics/plan_quality/schema.py +11 -0
  39. deepeval/metrics/plan_quality/template.py +101 -0
  40. deepeval/metrics/step_efficiency/__init__.py +1 -0
  41. deepeval/metrics/step_efficiency/schema.py +11 -0
  42. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  43. deepeval/metrics/step_efficiency/template.py +256 -0
  44. deepeval/metrics/task_completion/task_completion.py +1 -0
  45. deepeval/metrics/tool_correctness/schema.py +6 -0
  46. deepeval/metrics/tool_correctness/template.py +88 -0
  47. deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
  48. deepeval/metrics/tool_use/__init__.py +1 -0
  49. deepeval/metrics/tool_use/schema.py +19 -0
  50. deepeval/metrics/tool_use/template.py +220 -0
  51. deepeval/metrics/tool_use/tool_use.py +458 -0
  52. deepeval/metrics/topic_adherence/__init__.py +1 -0
  53. deepeval/metrics/topic_adherence/schema.py +16 -0
  54. deepeval/metrics/topic_adherence/template.py +162 -0
  55. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  56. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  57. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  58. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  59. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  60. deepeval/models/llms/amazon_bedrock_model.py +20 -17
  61. deepeval/models/llms/openai_model.py +10 -1
  62. deepeval/models/retry_policy.py +103 -20
  63. deepeval/openai/extractors.py +61 -16
  64. deepeval/openai/patch.py +8 -12
  65. deepeval/openai/types.py +1 -1
  66. deepeval/openai/utils.py +108 -1
  67. deepeval/prompt/prompt.py +1 -0
  68. deepeval/prompt/utils.py +43 -14
  69. deepeval/simulator/conversation_simulator.py +25 -18
  70. deepeval/synthesizer/chunking/context_generator.py +9 -1
  71. deepeval/synthesizer/synthesizer.py +11 -10
  72. deepeval/test_case/llm_test_case.py +6 -2
  73. deepeval/test_run/test_run.py +190 -207
  74. deepeval/tracing/__init__.py +2 -1
  75. deepeval/tracing/otel/exporter.py +3 -4
  76. deepeval/tracing/otel/utils.py +23 -4
  77. deepeval/tracing/trace_context.py +53 -38
  78. deepeval/tracing/tracing.py +23 -0
  79. deepeval/tracing/types.py +16 -14
  80. deepeval/utils.py +21 -0
  81. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/METADATA +1 -1
  82. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/RECORD +85 -63
  83. deepeval/integrations/llama_index/agent/patched.py +0 -68
  84. deepeval/tracing/message_types/__init__.py +0 -10
  85. deepeval/tracing/message_types/base.py +0 -6
  86. deepeval/tracing/message_types/messages.py +0 -14
  87. deepeval/tracing/message_types/tools.py +0 -18
  88. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md +0 -0
  89. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/WHEEL +0 -0
  90. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt +0 -0
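
The largest change in this release is the rewrite of deepeval/evaluate/execute.py (+1662 / -688), whose per-file diff follows. Its new timeout error messages point users at two environment variables, DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE and DEEPEVAL_LOG_STACK_TRACES. The short Python sketch below is not part of the diff; it only illustrates, as an assumption about how these settings are consumed, how one might raise the per-task deadline and enable stack-trace logging before calling deepeval's documented evaluate() entry point.

# Hypothetical usage sketch, not taken from the diff. The variable names come from
# the new timeout error messages in deepeval/evaluate/execute.py; their exact
# semantics and defaults are assumptions here.
import os

os.environ["DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE"] = "600"  # assumed: per-task deadline in seconds
os.environ["DEEPEVAL_LOG_STACK_TRACES"] = "1"  # assumed: log full tracebacks on metric errors/timeouts

from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
    retrieval_context=["Paris is the capital and largest city of France."],
)

# Requires an LLM judge to be configured (e.g. OPENAI_API_KEY); the timeout
# handling added in 3.6.9 applies while these metrics run.
evaluate(test_cases=[test_case], metrics=[AnswerRelevancyMetric()])
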
@@ -1,3 +1,4 @@
+ import inspect
  import logging
 
  from rich.progress import (
@@ -43,9 +44,14 @@ from deepeval.tracing.api (
  )
  from deepeval.dataset import Golden
  from deepeval.contextvars import set_current_golden, reset_current_golden
- from deepeval.errors import MissingTestCaseParamsError
+ from deepeval.errors import MissingTestCaseParamsError, DeepEvalError
  from deepeval.metrics.utils import copy_metrics
- from deepeval.utils import get_or_create_event_loop, shorten, len_medium
+ from deepeval.utils import (
+ get_or_create_event_loop,
+ shorten,
+ len_medium,
+ format_error_text,
+ )
  from deepeval.telemetry import capture_evaluation_run
  from deepeval.metrics import (
  BaseMetric,
@@ -56,6 +62,11 @@ from deepeval.metrics import (
  from deepeval.metrics.indicator import (
  measure_metrics_with_indicator,
  )
+ from deepeval.models.retry_policy import (
+ set_outer_deadline,
+ reset_outer_deadline,
+ run_sync_with_timeout,
+ )
  from deepeval.test_case import (
  LLMTestCase,
  ConversationalTestCase,
@@ -82,10 +93,13 @@ from deepeval.evaluate.utils import (
  create_metric_data,
  create_test_result,
  count_metrics_in_trace,
+ count_total_metrics_for_trace,
+ count_metrics_in_span_subtree,
  extract_trace_test_results,
  )
  from deepeval.utils import add_pbar, update_pbar, custom_console
- from deepeval.tracing.types import TestCaseMetricPair
+ from deepeval.tracing.types import TestCaseMetricPair, TraceSpanStatus
+ from deepeval.tracing.api import TraceSpanApiStatus
  from deepeval.config.settings import get_settings
  from deepeval.test_run import TEMP_FILE_PATH
  from deepeval.confident.api import is_confident
@@ -97,6 +111,108 @@ from deepeval.test_run.hyperparameters import (
  logger = logging.getLogger(__name__)
 
 
+ def _skip_metrics_for_error(
+ span: Optional[BaseSpan] = None,
+ trace: Optional[Trace] = None,
+ ) -> bool:
+ # trace failure: skip everything under this trace
+ if trace is not None and trace.status == TraceSpanStatus.ERRORED:
+ return True
+ # span failure: skip this span’s metrics
+ if span is not None and span.status == TraceSpanStatus.ERRORED:
+ return True
+ return False
+
+
+ def _trace_error(current_trace: Trace) -> Optional[str]:
+ def _first_err(s: BaseSpan) -> Optional[str]:
+ if s.status == TraceSpanStatus.ERRORED and s.error:
+ return s.error
+ for c in s.children or []:
+ e = _first_err(c)
+ if e:
+ return e
+ return None
+
+ for root in current_trace.root_spans or []:
+ e = _first_err(root)
+ if e:
+ return e
+ return None
+
+
+ def _get_trace_by_uuid_anywhere(trace_uuid: str):
+ """
+ Resolver for a trace UUID across the manager's state.
+
+ First tries the manager's indexed lookup, which (covers active/in-flight traces,
+ then does a linear scan of the full `trace_manager.traces` list, which covers
+ traces that were recorded/closed earlier or not yet indexed. Returns
+ the concrete Trace object or None if not found.
+ """
+ tr = trace_manager.get_trace_by_uuid(trace_uuid)
+ if tr:
+ return tr
+ for tr in trace_manager.traces:
+ if tr.uuid == trace_uuid:
+ return tr
+ return None
+
+
+ def _pick_root_for_marking(trace):
+ """
+ Choose the most appropriate root span to annotate on error/cancel.
+
+ Heuristic:
+ - Prefer the most recent open root, which will have no `end_time` since this is the
+ span currently in flight.
+ - If none are open, use the last root span if it exists.
+ - If the trace has no roots, return None.
+
+ This favors marking the active root in multi root traces while remaining
+ stable for already closed traces.
+ """
+ open_roots = [rs for rs in trace.root_spans if rs.end_time is None]
+ return (
+ open_roots[-1]
+ if open_roots
+ else (trace.root_spans[-1] if trace.root_spans else None)
+ )
+
+
+ def _resolve_trace_and_root_for_task(t: asyncio.Task):
+ """
+ Resolve trace and root for a completed task using the weak binding map.
+
+ Steps:
+ 1. Look up the task in `trace_manager.task_bindings` to get the
+ bound `trace_uuid` and, if available, `root_span_uuid`.
+ 2. Resolve the Trace with `_get_trace_by_uuid_anywhere`.
+ 3. If a bound root UUID exists, try to find that exact root on the trace.
+ 4. Otherwise, fall back to `_pick_root_for_marking(trace)`.
+
+ Returns a trace / root tuple. Either may be `None` when no binding is
+ present. This function is used by `on_task_done` to robustly mark error/cancel
+ states without assuming a single root trace or a root that is still open.
+ """
+ binding = trace_manager.task_bindings.get(t) or {}
+ trace_uuid = binding.get("trace_uuid")
+ root_span_uuid = binding.get("root_span_uuid")
+
+ trace = _get_trace_by_uuid_anywhere(trace_uuid) if trace_uuid else None
+ root = None
+
+ if trace and root_span_uuid:
+ root = next(
+ (rs for rs in trace.root_spans if rs.uuid == root_span_uuid), None
+ )
+
+ if trace and root is None:
+ root = _pick_root_for_marking(trace)
+
+ return trace, root
+
+
  async def _snapshot_tasks():
  cur = asyncio.current_task()
  # `all_tasks` returns tasks for the current running loop only
@@ -115,6 +231,32 @@ def _gather_timeout() -> float:
  )
 
 
+ def filter_duplicate_results(
+ main_result: TestResult, results: List[TestResult]
+ ) -> List[TestResult]:
+ return [
+ result
+ for result in results
+ if not (
+ (result.input == main_result.input)
+ and (result.actual_output == main_result.actual_output)
+ and (result.metrics_data == main_result.metrics_data)
+ )
+ ]
+
+
+ async def _await_with_outer_deadline(obj, *args, timeout: float, **kwargs):
+ token = set_outer_deadline(timeout)
+ try:
+ if inspect.isawaitable(obj):
+ coro = obj
+ else:
+ coro = obj(*args, **kwargs)
+ return await asyncio.wait_for(coro, timeout=timeout)
+ finally:
+ reset_outer_deadline(token)
+
+
  ###########################################
  ### E2E Evals #############################
  ###########################################
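
A note for orientation, not part of the released code: _await_with_outer_deadline above accepts either an already-created coroutine or a callable plus its arguments, records an outer deadline for the duration of the await, and leaves the actual cancellation to asyncio.wait_for. The self-contained sketch below approximates that call pattern; set_outer_deadline/reset_outer_deadline are stubbed with a contextvars variable here because their real implementations live in deepeval/models/retry_policy.py and are not shown in this diff.

# Stand-alone approximation of the call pattern; the real helpers in
# deepeval.models.retry_policy may behave differently.
import asyncio
import contextvars
import inspect
import time

_outer_deadline = contextvars.ContextVar("outer_deadline", default=None)

def set_outer_deadline(timeout: float):
    # stub: remember an absolute deadline that downstream retry logic could consult
    return _outer_deadline.set(time.monotonic() + timeout)

def reset_outer_deadline(token) -> None:
    _outer_deadline.reset(token)

async def await_with_outer_deadline(obj, *args, timeout: float, **kwargs):
    token = set_outer_deadline(timeout)
    try:
        # accept either a coroutine object or a callable that produces one
        coro = obj if inspect.isawaitable(obj) else obj(*args, **kwargs)
        return await asyncio.wait_for(coro, timeout=timeout)
    finally:
        reset_outer_deadline(token)

async def slow_metric():
    await asyncio.sleep(0.1)
    return "ok"

print(asyncio.run(await_with_outer_deadline(slow_metric, timeout=1.0)))  # -> ok
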
@@ -146,6 +288,13 @@ def execute_test_cases(
 
  test_run_manager.save_to_disk = cache_config.write_cache
  test_run = test_run_manager.get_test_run(identifier=identifier)
+ if test_run is None:
+ # ensure we have a test_run ( in case it couldn't be loaded from disk )
+ test_run_manager.create_test_run(identifier=identifier)
+ test_run = test_run_manager.get_test_run(identifier=identifier)
+
+ # capture once for inner closures
+ hyperparameters = test_run.hyperparameters if test_run is not None else None
 
  if display_config.verbose_mode is not None:
  for metric in metrics:
@@ -166,176 +315,228 @@ def execute_test_cases(
166
315
  test_results: List[TestResult] = []
167
316
 
168
317
  def evaluate_test_cases(
169
- progress: Optional[Progress] = None, pbar_id: Optional[str] = None
318
+ progress: Optional[Progress] = None, pbar_id: Optional[int] = None
170
319
  ):
171
320
  llm_test_case_count = -1
321
+ mllm_test_case_count = -1
172
322
  conversational_test_case_count = -1
173
323
  show_metric_indicator = (
174
324
  display_config.show_indicator and not _use_bar_indicator
175
325
  )
176
326
  for i, test_case in enumerate(test_cases):
327
+ # skip what we know we won't run
328
+ if isinstance(test_case, LLMTestCase):
329
+ if not llm_metrics:
330
+ update_pbar(progress, pbar_id)
331
+ continue
332
+ per_case_total = len(llm_metrics)
333
+ elif isinstance(test_case, MLLMTestCase):
334
+ if not mllm_metrics:
335
+ update_pbar(progress, pbar_id)
336
+ continue
337
+ per_case_total = len(mllm_metrics)
338
+ elif isinstance(test_case, ConversationalTestCase):
339
+ if not conversational_metrics:
340
+ update_pbar(progress, pbar_id)
341
+ continue
342
+ per_case_total = len(conversational_metrics)
343
+
177
344
  pbar_test_case_id = add_pbar(
178
345
  progress,
179
346
  f" 🎯 Evaluating test case #{i}",
180
- total=len(metrics),
347
+ total=per_case_total,
181
348
  )
182
- with capture_evaluation_run("test case"):
183
- for metric in metrics:
184
- metric.error = None # Reset metric error
185
-
186
- if isinstance(test_case, LLMTestCase):
187
- if len(llm_metrics) == 0:
188
- continue
189
-
190
- llm_test_case_count += 1
191
- cached_test_case = None
192
- if cache_config.use_cache:
193
- cached_test_case = (
194
- global_test_run_cache_manager.get_cached_test_case(
195
- test_case, test_run.hyperparameters
196
- )
197
- )
198
-
199
- ##### Metric Calculation #####
200
- api_test_case: LLMApiTestCase = create_api_test_case(
201
- test_case=test_case, index=llm_test_case_count
202
- )
203
- new_cached_test_case: CachedTestCase = CachedTestCase()
204
-
205
- test_start_time = time.perf_counter()
206
- read_all_metrics_from_cache = True
207
- for metric in llm_metrics:
208
- metric_data = None
209
- if cached_test_case is not None:
210
- cached_metric_data = Cache.get_metric_data(
211
- metric, cached_test_case
212
- )
213
- if cached_metric_data:
214
- metric_data = cached_metric_data.metric_data
215
349
 
216
- if metric_data is None:
217
- read_all_metrics_from_cache = False
218
- res = _execute_metric(
219
- metric=metric,
220
- test_case=test_case,
221
- show_metric_indicator=show_metric_indicator,
222
- in_component=False,
223
- error_config=error_config,
224
- )
225
- if res == "skip":
226
- continue
227
- metric_data = create_metric_data(metric)
228
-
229
- # here, we will check for an additional property on the flattened test cases to see if updating is necessary
230
- api_test_case.update_metric_data(metric_data)
231
- if metric.error is None:
232
- cache_metric_data = deepcopy(metric_data)
233
- cache_metric_data.evaluation_cost = 0 # Cached metrics will have evaluation cost as 0, not None.
234
- updated_cached_metric_data = CachedMetricData(
235
- metric_data=cache_metric_data,
236
- metric_configuration=Cache.create_metric_configuration(
237
- metric
238
- ),
239
- )
240
- new_cached_test_case.cached_metrics_data.append(
241
- updated_cached_metric_data
242
- )
243
- update_pbar(progress, pbar_test_case_id)
244
-
245
- test_end_time = time.perf_counter()
246
- if read_all_metrics_from_cache:
247
- run_duration = 0
248
- else:
249
- run_duration = test_end_time - test_start_time
250
- api_test_case.update_run_duration(run_duration)
251
-
252
- ### Update Test Run ###
253
- test_run_manager.update_test_run(api_test_case, test_case)
254
-
255
- ### Cache Test Run ###
256
- global_test_run_cache_manager.cache_test_case(
257
- test_case,
258
- new_cached_test_case,
259
- test_run.hyperparameters,
260
- )
261
- global_test_run_cache_manager.cache_test_case(
262
- test_case,
263
- new_cached_test_case,
264
- test_run.hyperparameters,
265
- to_temp=True,
350
+ metrics_for_case = (
351
+ llm_metrics
352
+ if isinstance(test_case, LLMTestCase)
353
+ else (
354
+ mllm_metrics
355
+ if isinstance(test_case, MLLMTestCase)
356
+ else conversational_metrics
357
+ )
358
+ )
359
+ api_test_case = create_api_test_case(
360
+ test_case=test_case,
361
+ index=(
362
+ llm_test_case_count + 1
363
+ if isinstance(test_case, LLMTestCase)
364
+ else (
365
+ mllm_test_case_count + 1
366
+ if isinstance(test_case, MLLMTestCase)
367
+ else conversational_test_case_count + 1
266
368
  )
369
+ ),
370
+ )
371
+ emitted = [False] * len(metrics_for_case)
372
+ index_of = {id(m): i for i, m in enumerate(metrics_for_case)}
373
+ current_index = -1
374
+ start_time = time.perf_counter()
375
+ deadline_timeout = _per_task_timeout()
376
+ deadline_token = set_outer_deadline(deadline_timeout)
377
+ new_cached_test_case: CachedTestCase = None
378
+ try:
267
379
 
268
- # No caching and not sending test cases to Confident AI for multimodal metrics yet
269
- elif isinstance(test_case, MLLMTestCase):
270
- if len(mllm_metrics) == 0:
380
+ def _run_case():
381
+ nonlocal new_cached_test_case, current_index, llm_test_case_count, mllm_test_case_count, conversational_test_case_count
382
+ with capture_evaluation_run("test case"):
383
+ for metric in metrics:
384
+ metric.error = None # Reset metric error
385
+
386
+ if isinstance(test_case, LLMTestCase):
387
+ llm_test_case_count += 1
388
+ cached_test_case = None
389
+ if cache_config.use_cache:
390
+ cached_test_case = global_test_run_cache_manager.get_cached_test_case(
391
+ test_case, hyperparameters
392
+ )
393
+
394
+ ##### Metric Calculation #####
395
+ new_cached_test_case = CachedTestCase()
396
+
397
+ for metric in llm_metrics:
398
+ current_index = index_of[id(metric)]
399
+ metric_data = None
400
+ if cached_test_case is not None:
401
+ cached_metric_data = Cache.get_metric_data(
402
+ metric, cached_test_case
403
+ )
404
+ if cached_metric_data:
405
+ metric_data = (
406
+ cached_metric_data.metric_data
407
+ )
408
+
409
+ if metric_data is None:
410
+ res = _execute_metric(
411
+ metric=metric,
412
+ test_case=test_case,
413
+ show_metric_indicator=show_metric_indicator,
414
+ in_component=False,
415
+ error_config=error_config,
416
+ )
417
+ if res == "skip":
418
+ continue
419
+ metric_data = create_metric_data(metric)
420
+
421
+ # here, we will check for an additional property on the flattened test cases to see if updating is necessary
422
+ api_test_case.update_metric_data(metric_data)
423
+ emitted[current_index] = True
424
+ if metric.error is None:
425
+ cache_metric_data = deepcopy(metric_data)
426
+ cache_metric_data.evaluation_cost = 0 # Cached metrics will have evaluation cost as 0, not None.
427
+ updated_cached_metric_data = CachedMetricData(
428
+ metric_data=cache_metric_data,
429
+ metric_configuration=Cache.create_metric_configuration(
430
+ metric
431
+ ),
432
+ )
433
+ new_cached_test_case.cached_metrics_data.append(
434
+ updated_cached_metric_data
435
+ )
436
+ update_pbar(progress, pbar_test_case_id)
437
+
438
+ # No caching and not sending test cases to Confident AI for multimodal metrics yet
439
+ elif isinstance(test_case, MLLMTestCase):
440
+ mllm_test_case_count += 1
441
+ for metric in mllm_metrics:
442
+ current_index = index_of[id(metric)]
443
+ res = _execute_metric(
444
+ metric=metric,
445
+ test_case=test_case,
446
+ show_metric_indicator=show_metric_indicator,
447
+ in_component=False,
448
+ error_config=error_config,
449
+ )
450
+ if res == "skip":
451
+ continue
452
+
453
+ metric_data = create_metric_data(metric)
454
+ api_test_case.update_metric_data(metric_data)
455
+ emitted[current_index] = True
456
+ update_pbar(progress, pbar_test_case_id)
457
+
458
+ # No caching for conversational metrics yet
459
+ elif isinstance(test_case, ConversationalTestCase):
460
+ conversational_test_case_count += 1
461
+ for metric in conversational_metrics:
462
+ current_index = index_of[id(metric)]
463
+ res = _execute_metric(
464
+ metric=metric,
465
+ test_case=test_case,
466
+ show_metric_indicator=show_metric_indicator,
467
+ in_component=False,
468
+ error_config=error_config,
469
+ )
470
+ if res == "skip":
471
+ continue
472
+
473
+ metric_data = create_metric_data(metric)
474
+ api_test_case.update_metric_data(metric_data)
475
+ emitted[current_index] = True
476
+ update_pbar(progress, pbar_test_case_id)
477
+
478
+ run_sync_with_timeout(_run_case, deadline_timeout)
479
+ except (asyncio.TimeoutError, TimeoutError):
480
+ msg = (
481
+ f"Timed out after {deadline_timeout:.2f}s while evaluating metric. "
482
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
483
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
484
+ )
485
+ for i, m in enumerate(metrics_for_case):
486
+ if getattr(m, "skipped", False):
271
487
  continue
272
-
273
- api_test_case: LLMApiTestCase = create_api_test_case(
274
- test_case=test_case, index=llm_test_case_count
275
- )
276
- test_start_time = time.perf_counter()
277
- for metric in mllm_metrics:
278
- res = _execute_metric(
279
- metric=metric,
280
- test_case=test_case,
281
- show_metric_indicator=show_metric_indicator,
282
- in_component=False,
283
- error_config=error_config,
284
- )
285
- if res == "skip":
286
- continue
287
-
288
- metric_data = create_metric_data(metric)
289
- api_test_case.update_metric_data(metric_data)
290
- update_pbar(progress, pbar_test_case_id)
291
-
292
- test_end_time = time.perf_counter()
293
- if len(mllm_metrics) > 0:
294
- run_duration = test_end_time - test_start_time
295
- api_test_case.update_run_duration(run_duration)
296
-
297
- ### Update Test Run ###
298
- test_run_manager.update_test_run(api_test_case, test_case)
299
-
300
- # No caching for conversational metrics yet
301
- elif isinstance(test_case, ConversationalTestCase):
302
- if len(metrics) == 0:
488
+ # already finished or errored? leave it
489
+ if getattr(m, "success", None) is not None or getattr(
490
+ m, "error", None
491
+ ):
303
492
  continue
493
+ if i == current_index:
494
+ m.success = False
495
+ m.error = msg
496
+ elif i > current_index:
497
+ m.success = False
498
+ m.error = "Skipped due to case timeout."
499
+
500
+ if not error_config.ignore_errors:
501
+ raise
304
502
 
305
- conversational_test_case_count += 1
306
- api_test_case: ConversationalApiTestCase = (
307
- create_api_test_case(
308
- test_case=test_case,
309
- index=conversational_test_case_count,
503
+ finally:
504
+ try:
505
+ if (
506
+ isinstance(test_case, LLMTestCase)
507
+ and new_cached_test_case is not None
508
+ ):
509
+ ### Cache Test Run ###
510
+ global_test_run_cache_manager.cache_test_case(
511
+ test_case,
512
+ new_cached_test_case,
513
+ hyperparameters,
310
514
  )
311
- )
312
-
313
- test_start_time = time.perf_counter()
314
- for metric in metrics:
315
- res = _execute_metric(
316
- metric=metric,
317
- test_case=test_case,
318
- show_metric_indicator=show_metric_indicator,
319
- in_component=False,
320
- error_config=error_config,
515
+ global_test_run_cache_manager.cache_test_case(
516
+ test_case,
517
+ new_cached_test_case,
518
+ hyperparameters,
519
+ to_temp=True,
321
520
  )
322
- if res == "skip":
323
- continue
324
-
325
- metric_data = create_metric_data(metric)
326
- api_test_case.update_metric_data(metric_data)
327
- update_pbar(progress, pbar_test_case_id)
328
521
 
329
- test_end_time = time.perf_counter()
330
- run_duration = test_end_time - test_start_time
331
- api_test_case.update_run_duration(run_duration)
522
+ # Attach MetricData for *all* metrics (finished or synthesized)
523
+ for i, m in enumerate(metrics_for_case):
524
+ if getattr(m, "skipped", False):
525
+ continue
526
+ if not emitted[i]:
527
+ api_test_case.update_metric_data(
528
+ create_metric_data(m)
529
+ )
332
530
 
333
- ### Update Test Run ###
531
+ elapsed = time.perf_counter() - start_time
532
+ api_test_case.update_run_duration(
533
+ elapsed if elapsed >= 0 else deadline_timeout
534
+ )
334
535
  test_run_manager.update_test_run(api_test_case, test_case)
335
-
336
- test_result = create_test_result(api_test_case)
337
- test_results.append(test_result)
338
- update_pbar(progress, pbar_id)
536
+ test_results.append(create_test_result(api_test_case))
537
+ update_pbar(progress, pbar_id)
538
+ finally:
539
+ reset_outer_deadline(deadline_token)
339
540
 
340
541
  if display_config.show_indicator and _use_bar_indicator:
341
542
  progress = Progress(
@@ -380,7 +581,10 @@ async def a_execute_test_cases(
 
  async def execute_with_semaphore(func: Callable, *args, **kwargs):
  async with semaphore:
- return await func(*args, **kwargs)
+ timeout = _per_task_timeout()
+ return await _await_with_outer_deadline(
+ func, *args, timeout=timeout, **kwargs
+ )
 
  global_test_run_cache_manager.disable_write_cache = (
  cache_config.write_cache is False
@@ -483,7 +687,7 @@
 
  task = execute_with_semaphore(
  func=_a_execute_conversational_test_cases,
- metrics=copy_metrics(metrics),
+ metrics=copy_metrics(conversational_metrics),
  test_case=test_case,
  test_run_manager=test_run_manager,
  test_results=test_results,
@@ -499,7 +703,22 @@
  tasks.append(asyncio.create_task(task))
 
  await asyncio.sleep(async_config.throttle_value)
- await asyncio.gather(*tasks)
+
+ try:
+ await asyncio.wait_for(
+ asyncio.gather(*tasks),
+ timeout=_gather_timeout(),
+ )
+ except (asyncio.TimeoutError, TimeoutError):
+ for t in tasks:
+ if not t.done():
+ t.cancel()
+ await asyncio.gather(*tasks, return_exceptions=True)
+ logging.getLogger("deepeval").error(
+ "Gather timed out after %.1fs. Some metrics may be marked as timed out.",
+ _gather_timeout(),
+ )
+
  else:
  for test_case in test_cases:
  with capture_evaluation_run("test case"):
@@ -572,7 +791,19 @@ async def a_execute_test_cases(
  tasks.append(asyncio.create_task(task))
 
  await asyncio.sleep(async_config.throttle_value)
- await asyncio.gather(*tasks)
+
+ try:
+ await asyncio.wait_for(
+ asyncio.gather(*tasks),
+ timeout=_gather_timeout(),
+ )
+ except (asyncio.TimeoutError, TimeoutError):
+ # Cancel any still-pending tasks and drain them
+ for t in tasks:
+ if not t.done():
+ t.cancel()
+ await asyncio.gather(*tasks, return_exceptions=True)
+ raise
  return test_results
 
 
@@ -593,6 +824,7 @@ async def _a_execute_llm_test_cases(
  progress: Optional[Progress] = None,
  pbar_id: Optional[int] = None,
  ):
+ logger.info("in _a_execute_llm_test_cases")
  pbar_test_case_id = add_pbar(
  progress,
  f" 🎯 Evaluating test case #{count}",
@@ -616,64 +848,85 @@ async def _a_execute_llm_test_cases(
616
848
  api_test_case = create_api_test_case(
617
849
  test_case=test_case, index=count if not _is_assert_test else None
618
850
  )
619
- new_cached_test_case: CachedTestCase = CachedTestCase()
620
- test_start_time = time.perf_counter()
621
- await measure_metrics_with_indicator(
622
- metrics=metrics,
623
- test_case=test_case,
624
- cached_test_case=cached_test_case,
625
- skip_on_missing_params=skip_on_missing_params,
626
- ignore_errors=ignore_errors,
627
- show_indicator=show_metrics_indicator,
628
- pbar_eval_id=pbar_test_case_id,
629
- progress=progress,
630
- )
851
+ try:
852
+ new_cached_test_case: CachedTestCase = CachedTestCase()
853
+ test_start_time = time.perf_counter()
631
854
 
632
- for metric in metrics:
633
- if metric.skipped:
634
- continue
855
+ await measure_metrics_with_indicator(
856
+ metrics=metrics,
857
+ test_case=test_case,
858
+ cached_test_case=cached_test_case,
859
+ skip_on_missing_params=skip_on_missing_params,
860
+ ignore_errors=ignore_errors,
861
+ show_indicator=show_metrics_indicator,
862
+ pbar_eval_id=pbar_test_case_id,
863
+ progress=progress,
864
+ )
865
+ except asyncio.CancelledError:
866
+ msg = (
867
+ "Timed out/cancelled while evaluating metric. "
868
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
869
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
870
+ )
871
+ for m in metrics:
872
+ if getattr(m, "skipped", False):
873
+ continue
874
+ # If the task never finished and didn't set a terminal state, mark it now
875
+ if getattr(m, "success", None) is None and not getattr(
876
+ m, "error", None
877
+ ):
878
+ m.success = False
879
+ m.error = msg
880
+ if not ignore_errors:
881
+ raise
882
+ finally:
883
+ for metric in metrics:
884
+ if metric.skipped:
885
+ continue
635
886
 
636
- metric_data = create_metric_data(metric)
637
- api_test_case.update_metric_data(metric_data)
887
+ metric_data = create_metric_data(metric)
888
+ api_test_case.update_metric_data(metric_data)
638
889
 
639
- if metric.error is None:
640
- cache_metric_data = deepcopy(metric_data)
641
- cache_metric_data.evaluation_cost = (
642
- 0 # Create new copy and save 0 for cost
643
- )
644
- updated_cached_metric_data = CachedMetricData(
645
- metric_data=cache_metric_data,
646
- metric_configuration=Cache.create_metric_configuration(metric),
647
- )
648
- new_cached_test_case.cached_metrics_data.append(
649
- updated_cached_metric_data
650
- )
890
+ if metric.error is None:
891
+ cache_metric_data = deepcopy(metric_data)
892
+ cache_metric_data.evaluation_cost = (
893
+ 0 # Create new copy and save 0 for cost
894
+ )
895
+ updated_cached_metric_data = CachedMetricData(
896
+ metric_data=cache_metric_data,
897
+ metric_configuration=Cache.create_metric_configuration(
898
+ metric
899
+ ),
900
+ )
901
+ new_cached_test_case.cached_metrics_data.append(
902
+ updated_cached_metric_data
903
+ )
651
904
 
652
- test_end_time = time.perf_counter()
653
- run_duration = test_end_time - test_start_time
654
- # Quick hack to check if all metrics were from cache
655
- if run_duration < 1:
656
- run_duration = 0
657
- api_test_case.update_run_duration(run_duration)
658
-
659
- ### Update Test Run ###
660
- test_run_manager.update_test_run(api_test_case, test_case)
661
-
662
- ### Cache Test Run ###
663
- global_test_run_cache_manager.cache_test_case(
664
- test_case,
665
- new_cached_test_case,
666
- test_run.hyperparameters,
667
- )
668
- global_test_run_cache_manager.cache_test_case(
669
- test_case,
670
- new_cached_test_case,
671
- test_run.hyperparameters,
672
- to_temp=True,
673
- )
905
+ test_end_time = time.perf_counter()
906
+ run_duration = test_end_time - test_start_time
907
+ # Quick hack to check if all metrics were from cache
908
+ if run_duration < 1:
909
+ run_duration = 0
910
+ api_test_case.update_run_duration(run_duration)
911
+
912
+ ### Update Test Run ###
913
+ test_run_manager.update_test_run(api_test_case, test_case)
914
+
915
+ ### Cache Test Run ###
916
+ global_test_run_cache_manager.cache_test_case(
917
+ test_case,
918
+ new_cached_test_case,
919
+ test_run.hyperparameters,
920
+ )
921
+ global_test_run_cache_manager.cache_test_case(
922
+ test_case,
923
+ new_cached_test_case,
924
+ test_run.hyperparameters,
925
+ to_temp=True,
926
+ )
674
927
 
675
- test_results.append(create_test_result(api_test_case))
676
- update_pbar(progress, pbar_id)
928
+ test_results.append(create_test_result(api_test_case))
929
+ update_pbar(progress, pbar_id)
677
930
 
678
931
 
679
932
  async def _a_execute_mllm_test_cases(
@@ -705,31 +958,50 @@ async def _a_execute_mllm_test_cases(
705
958
  test_case=test_case, index=count if not _is_assert_test else None
706
959
  )
707
960
  test_start_time = time.perf_counter()
708
- await measure_metrics_with_indicator(
709
- metrics=metrics,
710
- test_case=test_case,
711
- cached_test_case=None,
712
- skip_on_missing_params=skip_on_missing_params,
713
- ignore_errors=ignore_errors,
714
- show_indicator=show_metrics_indicator,
715
- pbar_eval_id=pbar_test_case_id,
716
- progress=progress,
717
- )
718
- for metric in metrics:
719
- if metric.skipped:
720
- continue
961
+ try:
962
+ await measure_metrics_with_indicator(
963
+ metrics=metrics,
964
+ test_case=test_case,
965
+ cached_test_case=None,
966
+ skip_on_missing_params=skip_on_missing_params,
967
+ ignore_errors=ignore_errors,
968
+ show_indicator=show_metrics_indicator,
969
+ pbar_eval_id=pbar_test_case_id,
970
+ progress=progress,
971
+ )
972
+ except asyncio.CancelledError:
973
+ msg = (
974
+ "Timed out/cancelled while evaluating metric. "
975
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
976
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
977
+ )
978
+ for m in metrics:
979
+ if getattr(m, "skipped", False):
980
+ continue
981
+ # If the task never finished and didn't set a terminal state, mark it now
982
+ if getattr(m, "success", None) is None and not getattr(
983
+ m, "error", None
984
+ ):
985
+ m.success = False
986
+ m.error = msg
987
+ if not ignore_errors:
988
+ raise
989
+ finally:
990
+ for metric in metrics:
991
+ if metric.skipped:
992
+ continue
721
993
 
722
- metric_data = create_metric_data(metric)
723
- api_test_case.update_metric_data(metric_data)
994
+ metric_data = create_metric_data(metric)
995
+ api_test_case.update_metric_data(metric_data)
724
996
 
725
- test_end_time = time.perf_counter()
726
- run_duration = test_end_time - test_start_time
727
- api_test_case.update_run_duration(run_duration)
997
+ test_end_time = time.perf_counter()
998
+ run_duration = test_end_time - test_start_time
999
+ api_test_case.update_run_duration(run_duration)
728
1000
 
729
- ### Update Test Run ###
730
- test_run_manager.update_test_run(api_test_case, test_case)
731
- test_results.append(create_test_result(api_test_case))
732
- update_pbar(progress, pbar_id)
1001
+ ### Update Test Run ###
1002
+ test_run_manager.update_test_run(api_test_case, test_case)
1003
+ test_results.append(create_test_result(api_test_case))
1004
+ update_pbar(progress, pbar_id)
733
1005
 
734
1006
 
735
1007
  async def _a_execute_conversational_test_cases(
@@ -764,33 +1036,55 @@ async def _a_execute_conversational_test_cases(
764
1036
  )
765
1037
 
766
1038
  test_start_time = time.perf_counter()
767
- await measure_metrics_with_indicator(
768
- metrics=metrics,
769
- test_case=test_case,
770
- cached_test_case=None,
771
- skip_on_missing_params=skip_on_missing_params,
772
- ignore_errors=ignore_errors,
773
- show_indicator=show_metrics_indicator,
774
- pbar_eval_id=pbar_test_case_id,
775
- progress=progress,
776
- )
777
- for metric in metrics:
778
- if metric.skipped:
779
- continue
780
1039
 
781
- metric_data = create_metric_data(metric)
782
- api_test_case.update_metric_data(metric_data)
1040
+ try:
1041
+ await measure_metrics_with_indicator(
1042
+ metrics=metrics,
1043
+ test_case=test_case,
1044
+ cached_test_case=None,
1045
+ skip_on_missing_params=skip_on_missing_params,
1046
+ ignore_errors=ignore_errors,
1047
+ show_indicator=show_metrics_indicator,
1048
+ pbar_eval_id=pbar_test_case_id,
1049
+ progress=progress,
1050
+ )
783
1051
 
784
- test_end_time = time.perf_counter()
785
- if len(metrics) > 0:
786
- run_duration = test_end_time - test_start_time
787
- api_test_case.update_run_duration(run_duration)
1052
+ except asyncio.CancelledError:
1053
+ msg = (
1054
+ "Timed out/cancelled while evaluating metric. "
1055
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
1056
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
1057
+ )
1058
+ for m in metrics:
1059
+ if getattr(m, "skipped", False):
1060
+ continue
1061
+ # If the task never finished and didn't set a terminal state, mark it now
1062
+ if getattr(m, "success", None) is None and not getattr(
1063
+ m, "error", None
1064
+ ):
1065
+ m.success = False
1066
+ m.error = msg
1067
+ if not ignore_errors:
1068
+ raise
1069
+
1070
+ finally:
1071
+ for metric in metrics:
1072
+ if metric.skipped:
1073
+ continue
1074
+
1075
+ metric_data = create_metric_data(metric)
1076
+ api_test_case.update_metric_data(metric_data)
1077
+
1078
+ test_end_time = time.perf_counter()
1079
+ if len(metrics) > 0:
1080
+ run_duration = test_end_time - test_start_time
1081
+ api_test_case.update_run_duration(run_duration)
788
1082
 
789
- ### Update Test Run ###
790
- test_run_manager.update_test_run(api_test_case, test_case)
1083
+ ### Update Test Run ###
1084
+ test_run_manager.update_test_run(api_test_case, test_case)
791
1085
 
792
- test_results.append(create_test_result(api_test_case))
793
- update_pbar(progress, pbar_id)
1086
+ test_results.append(create_test_result(api_test_case))
1087
+ update_pbar(progress, pbar_id)
794
1088
 
795
1089
 
796
1090
  ###########################################
@@ -814,7 +1108,11 @@ def execute_agentic_test_cases(
814
1108
  test_run_manager = global_test_run_manager
815
1109
 
816
1110
  test_run_manager.save_to_disk = cache_config.write_cache
817
- test_run_manager.get_test_run(identifier=identifier)
1111
+ test_run = test_run_manager.get_test_run(identifier=identifier)
1112
+ if test_run is None:
1113
+ # Create if not found
1114
+ test_run_manager.create_test_run(identifier=identifier)
1115
+ test_run = test_run_manager.get_test_run(identifier=identifier)
818
1116
 
819
1117
  local_trace_manager = trace_manager
820
1118
  local_trace_manager.evaluating = True
@@ -824,244 +1122,526 @@ def execute_agentic_test_cases(
824
1122
  progress: Optional[Progress] = None,
825
1123
  pbar_id: Optional[int] = None,
826
1124
  ):
827
- count = 0
1125
+ count = -1
828
1126
  show_metric_indicator = (
829
1127
  display_config.show_indicator and not _use_bar_indicator
830
1128
  )
831
1129
 
832
1130
  for golden in goldens:
833
- with capture_evaluation_run("golden"):
834
- count += 1
835
- total_tags = count_observe_decorators_in_module(
836
- observed_callback
837
- )
838
- pbar_tags_id = add_pbar(
839
- progress,
840
- f" ⚡ Invoking observed callback (#{count})",
841
- total=total_tags,
842
- )
843
-
844
- with Observer(
845
- "custom",
846
- func_name="Test Wrapper",
847
- _progress=progress,
848
- _pbar_callback_id=pbar_tags_id,
849
- ):
850
- if asyncio.iscoroutinefunction(observed_callback):
851
- loop = get_or_create_event_loop()
852
- coro = observed_callback(golden.input)
853
- loop.run_until_complete(
854
- asyncio.wait_for(
855
- coro,
856
- timeout=_per_task_timeout(),
857
- )
858
- )
859
- else:
860
- observed_callback(golden.input)
861
- current_trace: Trace = current_trace_context.get()
1131
+ count += 1
862
1132
 
863
- update_pbar(progress, pbar_tags_id, advance=total_tags)
864
- update_pbar(progress, pbar_id)
1133
+ pbar_case_increments = (
1134
+ 0 # tracks how many times we advance `pbar_id` for this golden
1135
+ )
1136
+ emitted_trace = set()
1137
+ current_trace: Optional[Trace] = None
1138
+ trace_api = None
1139
+ api_test_case = None
1140
+ test_case = None
1141
+
1142
+ def _run_golden():
1143
+ nonlocal current_trace, trace_api, api_test_case, test_case, pbar_case_increments
1144
+ # keep the evaluation context inside the timed function
1145
+ with capture_evaluation_run("golden"):
1146
+ total_tags = count_observe_decorators_in_module(
1147
+ observed_callback
1148
+ )
1149
+ pbar_tags_id = add_pbar(
1150
+ progress,
1151
+ f" ⚡ Invoking observed callback (#{count})",
1152
+ total=total_tags,
1153
+ )
865
1154
 
866
- # Create empty trace api for llm api test case
867
- trace_api = create_api_trace(current_trace, golden)
1155
+ with Observer(
1156
+ "custom",
1157
+ func_name="Test Wrapper",
1158
+ _progress=progress,
1159
+ _pbar_callback_id=pbar_tags_id,
1160
+ ):
1161
+ if asyncio.iscoroutinefunction(observed_callback):
1162
+ loop = get_or_create_event_loop()
1163
+ coro = observed_callback(golden.input)
1164
+ loop.run_until_complete(
1165
+ _await_with_outer_deadline(
1166
+ coro,
1167
+ timeout=_per_task_timeout(),
1168
+ )
1169
+ )
1170
+ else:
1171
+ observed_callback(golden.input)
868
1172
 
869
- # Format golden as test case to create llm api test case
870
- test_case = LLMTestCase(
871
- input=golden.input,
872
- actual_output=(
873
- str(current_trace.output)
874
- if current_trace.output is not None
875
- else None
876
- ),
877
- expected_output=current_trace.expected_output,
878
- context=current_trace.context,
879
- retrieval_context=current_trace.retrieval_context,
880
- additional_metadata=golden.additional_metadata,
881
- tools_called=current_trace.tools_called,
882
- expected_tools=current_trace.expected_tools,
883
- comments=golden.comments,
884
- name=golden.name,
885
- _dataset_alias=golden._dataset_alias,
886
- _dataset_id=golden._dataset_id,
887
- )
888
- api_test_case = create_api_test_case(
889
- test_case=test_case,
890
- trace=trace_api,
891
- index=count if not _is_assert_test else None,
892
- )
1173
+ # we have a trace now
1174
+ current_trace = current_trace_context.get()
893
1175
 
894
- # Run DFS to calculate metrics synchronously
895
- def dfs(
896
- span: BaseSpan,
897
- progress: Optional[Progress] = None,
898
- pbar_eval_id: Optional[int] = None,
899
- ):
900
- # Create API Span
901
- metrics: List[BaseMetric] = span.metrics
902
- api_span: BaseApiSpan = (
903
- trace_manager._convert_span_to_api_span(span)
1176
+ update_pbar(progress, pbar_tags_id, advance=total_tags)
1177
+ update_pbar(progress, pbar_id)
1178
+ pbar_case_increments += 1
1179
+
1180
+ # Create empty trace api for llm api test case
1181
+ trace_api = create_api_trace(current_trace, golden)
1182
+
1183
+ # Build the test case and api test case
1184
+ test_case = LLMTestCase(
1185
+ input=golden.input,
1186
+ actual_output=(
1187
+ str(current_trace.output)
1188
+ if current_trace
1189
+ and current_trace.output is not None
1190
+ else None
1191
+ ),
1192
+ expected_output=(
1193
+ current_trace.expected_output
1194
+ if current_trace
1195
+ else None
1196
+ ),
1197
+ context=(
1198
+ current_trace.context if current_trace else None
1199
+ ),
1200
+ retrieval_context=(
1201
+ current_trace.retrieval_context
1202
+ if current_trace
1203
+ else None
1204
+ ),
1205
+ additional_metadata=golden.additional_metadata,
1206
+ tools_called=(
1207
+ current_trace.tools_called
1208
+ if current_trace
1209
+ else None
1210
+ ),
1211
+ expected_tools=(
1212
+ current_trace.expected_tools
1213
+ if current_trace
1214
+ else None
1215
+ ),
1216
+ comments=golden.comments,
1217
+ name=golden.name,
1218
+ _dataset_alias=golden._dataset_alias,
1219
+ _dataset_id=golden._dataset_id,
904
1220
  )
905
- if isinstance(span, AgentSpan):
906
- trace_api.agent_spans.append(api_span)
907
- elif isinstance(span, LlmSpan):
908
- trace_api.llm_spans.append(api_span)
909
- log_prompt(span, test_run_manager)
910
- elif isinstance(span, RetrieverSpan):
911
- trace_api.retriever_spans.append(api_span)
912
- elif isinstance(span, ToolSpan):
913
- trace_api.tool_spans.append(api_span)
914
- else:
915
- trace_api.base_spans.append(api_span)
916
-
917
- for child in span.children:
918
- dfs(child, progress, pbar_eval_id)
919
-
920
- if span.metrics is None:
921
- return
922
- has_task_completion = any(
923
- isinstance(metric, TaskCompletionMetric)
924
- for metric in span.metrics
1221
+ api_test_case = create_api_test_case(
1222
+ test_case=test_case,
1223
+ trace=trace_api,
1224
+ index=count if not _is_assert_test else None,
925
1225
  )
926
1226
 
927
- llm_test_case = None
928
- if span.input is not None:
929
- llm_test_case = LLMTestCase(
930
- input=str(span.input),
931
- actual_output=(
932
- str(span.output)
933
- if span.output is not None
934
- else None
935
- ),
936
- expected_output=span.expected_output,
937
- context=span.context,
938
- retrieval_context=span.retrieval_context,
939
- tools_called=span.tools_called,
940
- expected_tools=span.expected_tools,
1227
+ # DFS and trace metric evaluation
1228
+ def dfs(
1229
+ span: BaseSpan,
1230
+ progress: Optional[Progress] = None,
1231
+ pbar_eval_id: Optional[int] = None,
1232
+ ):
1233
+ metrics: List[BaseMetric] = list(span.metrics or [])
1234
+ api_span: BaseApiSpan = (
1235
+ trace_manager._convert_span_to_api_span(span)
941
1236
  )
942
- if llm_test_case is None and not has_task_completion:
943
- raise ValueError(
944
- "Unable to run metrics on span without LLMTestCase. Are you sure you called `update_current_span()`?"
1237
+
1238
+ if isinstance(span, AgentSpan):
1239
+ trace_api.agent_spans.append(api_span)
1240
+ elif isinstance(span, LlmSpan):
1241
+ trace_api.llm_spans.append(api_span)
1242
+ log_prompt(span, test_run_manager)
1243
+ elif isinstance(span, RetrieverSpan):
1244
+ trace_api.retriever_spans.append(api_span)
1245
+ elif isinstance(span, ToolSpan):
1246
+ trace_api.tool_spans.append(api_span)
1247
+ else:
1248
+ trace_api.base_spans.append(api_span)
1249
+
1250
+ if _skip_metrics_for_error(
1251
+ span=span, trace=current_trace
1252
+ ):
1253
+ api_span.status = TraceSpanApiStatus.ERRORED
1254
+ api_span.error = span.error or _trace_error(
1255
+ current_trace
1256
+ )
1257
+ if progress and pbar_eval_id is not None:
1258
+ update_pbar(
1259
+ progress,
1260
+ pbar_eval_id,
1261
+ advance=count_metrics_in_span_subtree(span),
1262
+ )
1263
+ return
1264
+
1265
+ # evaluate children first
1266
+ for child in span.children:
1267
+ dfs(child, progress, pbar_eval_id)
1268
+
1269
+ # If there are no metrics, then there is nothing to do on this span.
1270
+ if not metrics:
1271
+ return
1272
+
1273
+ has_task_completion = any(
1274
+ isinstance(metric, TaskCompletionMetric)
1275
+ for metric in metrics
945
1276
  )
946
1277
 
947
- # add trace if task completion
948
- if has_task_completion:
949
- if llm_test_case is None:
950
- llm_test_case = LLMTestCase(input="None")
951
- llm_test_case._trace_dict = (
952
- trace_manager.create_nested_spans_dict(span)
1278
+ requires_trace = any(
1279
+ getattr(metric, "requires_trace", False)
1280
+ for metric in metrics
953
1281
  )
954
1282
 
955
- # Preparing metric calculation
956
- api_span.metrics_data = []
957
- for metric in metrics:
958
- metric.skipped = False
959
- metric.error = None
960
- if display_config.verbose_mode is not None:
961
- metric.verbose_mode = display_config.verbose_mode
1283
+ llm_test_case = None
1284
+ if span.input is not None:
1285
+ llm_test_case = LLMTestCase(
1286
+ input=str(span.input),
1287
+ actual_output=(
1288
+ str(span.output)
1289
+ if span.output is not None
1290
+ else None
1291
+ ),
1292
+ expected_output=span.expected_output,
1293
+ context=span.context,
1294
+ retrieval_context=span.retrieval_context,
1295
+ tools_called=span.tools_called,
1296
+ expected_tools=span.expected_tools,
1297
+ )
962
1298
 
963
- # Metric calculation
964
- for metric in metrics:
965
- metric_data = None
966
- res = _execute_metric(
967
- metric=metric,
968
- test_case=llm_test_case,
969
- show_metric_indicator=show_metric_indicator,
970
- in_component=True,
971
- error_config=error_config,
972
- )
973
- if res == "skip":
974
- continue
975
- metric_data = create_metric_data(metric)
976
- api_span.metrics_data.append(metric_data)
977
- api_test_case.update_status(metric_data.success)
978
- update_pbar(progress, pbar_eval_id)
1299
+ # If any metric needs a trace tree or a completion verdict, attach the trace
1300
+ if has_task_completion or requires_trace:
1301
+ if llm_test_case is None:
1302
+ llm_test_case = LLMTestCase(input="None")
1303
+ llm_test_case._trace_dict = (
1304
+ trace_manager.create_nested_spans_dict(span)
1305
+ )
1306
+ else:
1307
+ # Without a test case we cannot evaluate span metrics
1308
+ if llm_test_case is None:
1309
+ api_span.status = TraceSpanApiStatus.ERRORED
1310
+ api_span.error = format_error_text(
1311
+ DeepEvalError(
1312
+ "Span has metrics but no LLMTestCase. "
1313
+ "Are you sure you called `update_current_span()`?"
1314
+ )
1315
+ )
1316
+ if progress and pbar_eval_id is not None:
1317
+ update_pbar(
1318
+ progress,
1319
+ pbar_eval_id,
1320
+ advance=count_metrics_in_span_subtree(
1321
+ span
1322
+ ),
1323
+ )
1324
+ return
1325
+
1326
+ # Preparing metric calculation
1327
+ api_span.metrics_data = []
1328
+ for metric in metrics:
1329
+ metric.skipped = False
1330
+ metric.error = None
1331
+ if display_config.verbose_mode is not None:
1332
+ metric.verbose_mode = (
1333
+ display_config.verbose_mode
1334
+ )
1335
+
1336
+ # Metric calculation
1337
+ for metric in metrics:
1338
+ res = _execute_metric(
1339
+ metric=metric,
1340
+ test_case=llm_test_case,
1341
+ show_metric_indicator=show_metric_indicator,
1342
+ in_component=True,
1343
+ error_config=error_config,
1344
+ )
1345
+ if res == "skip":
1346
+ continue
1347
+ metric_data = create_metric_data(metric)
1348
+ api_span.metrics_data.append(metric_data)
1349
+ api_test_case.update_status(metric_data.success)
1350
+ update_pbar(progress, pbar_eval_id)
979
1351
 
980
- trace_level_metrics_count = (
981
- len(current_trace.metrics) if current_trace.metrics else 0
982
- )
983
- pbar_eval_id = add_pbar(
984
- progress,
985
- f" 🎯 Evaluating component(s) (#{count})",
986
- total=count_metrics_in_trace(trace=current_trace)
987
- + trace_level_metrics_count,
1352
+ trace_level_metrics_count = (
1353
+ len(current_trace.metrics)
1354
+ if current_trace and current_trace.metrics
1355
+ else 0
1356
+ )
1357
+ pbar_eval_id = add_pbar(
1358
+ progress,
1359
+ f" 🎯 Evaluating component(s) (#{count})",
1360
+ total=count_metrics_in_trace(trace=current_trace)
1361
+ + trace_level_metrics_count,
1362
+ )
1363
+
1364
+ start_time = time.perf_counter()
1365
+
1366
+ skip_metrics_for_this_golden = False
1367
+ if _skip_metrics_for_error(trace=current_trace):
1368
+ trace_api.status = TraceSpanApiStatus.ERRORED
1369
+ if progress and pbar_eval_id is not None:
1370
+ update_pbar(
1371
+ progress,
1372
+ pbar_eval_id,
1373
+ advance=count_total_metrics_for_trace(
1374
+ current_trace
1375
+ ),
1376
+ )
1377
+ else:
1378
+ if current_trace and current_trace.metrics:
1379
+ has_task_completion = any(
1380
+ isinstance(metric, TaskCompletionMetric)
1381
+ for metric in current_trace.metrics
1382
+ )
1383
+ requires_trace = any(
1384
+ getattr(metric, "requires_trace", False)
1385
+ for metric in current_trace.metrics
1386
+ )
1387
+ llm_test_case = None
1388
+ if current_trace.input:
1389
+ llm_test_case = LLMTestCase(
1390
+ input=str(current_trace.input),
1391
+ actual_output=(
1392
+ str(current_trace.output)
1393
+ if current_trace.output is not None
1394
+ else None
1395
+ ),
1396
+ expected_output=current_trace.expected_output,
1397
+ context=current_trace.context,
1398
+ retrieval_context=current_trace.retrieval_context,
1399
+ tools_called=current_trace.tools_called,
1400
+ expected_tools=current_trace.expected_tools,
1401
+ )
1402
+ if has_task_completion or requires_trace:
1403
+ if llm_test_case is None:
1404
+ llm_test_case = LLMTestCase(input="None")
1405
+ llm_test_case._trace_dict = (
1406
+ trace_manager.create_nested_spans_dict(
1407
+ current_trace.root_spans[0]
1408
+ )
1409
+ )
1410
+ else:
1411
+ if llm_test_case is None:
1412
+ current_trace.status = (
1413
+ TraceSpanStatus.ERRORED
1414
+ )
1415
+ trace_api.status = (
1416
+ TraceSpanApiStatus.ERRORED
1417
+ )
1418
+ if current_trace.root_spans:
1419
+ current_trace.root_spans[0].status = (
1420
+ TraceSpanStatus.ERRORED
1421
+ )
1422
+ current_trace.root_spans[0].error = (
1423
+ format_error_text(
1424
+ DeepEvalError(
1425
+ "Trace has metrics but no LLMTestCase (missing input/output). "
1426
+ "Are you sure you called `update_current_trace()`?"
1427
+ )
1428
+ )
1429
+ )
1430
+ if progress and pbar_eval_id is not None:
1431
+ update_pbar(
1432
+ progress,
1433
+ pbar_eval_id,
1434
+ advance=count_total_metrics_for_trace(
1435
+ current_trace
1436
+ ),
1437
+ )
1438
+ skip_metrics_for_this_golden = True
1439
+
1440
+ if not skip_metrics_for_this_golden:
1441
+ for metric in current_trace.metrics:
1442
+ metric.skipped = False
1443
+ metric.error = None
1444
+ if display_config.verbose_mode is not None:
1445
+ metric.verbose_mode = (
1446
+ display_config.verbose_mode
1447
+ )
1448
+
1449
+ trace_api.metrics_data = []
1450
+ for metric in current_trace.metrics:
1451
+ res = _execute_metric(
1452
+ metric=metric,
1453
+ test_case=llm_test_case,
1454
+ show_metric_indicator=show_metric_indicator,
1455
+ in_component=True,
1456
+ error_config=error_config,
1457
+ )
1458
+ if res == "skip":
1459
+ continue
1460
+
1461
+ if not metric.skipped:
1462
+ metric_data = create_metric_data(metric)
1463
+ trace_api.metrics_data.append(
1464
+ metric_data
1465
+ )
1466
+ api_test_case.update_metric_data(
1467
+ metric_data
1468
+ )
1469
+ api_test_case.update_status(
1470
+ metric_data.success
1471
+ )
1472
+ emitted_trace.add(id(metric))
1473
+ update_pbar(progress, pbar_eval_id)
1474
+
1475
+ # handle span metrics
1476
+ dfs(
1477
+ current_trace.root_spans[0],
1478
+ progress,
1479
+ pbar_eval_id,
1480
+ )
1481
+
1482
+ # TODO: Do I need this block, or is it duplicated in finally?
1483
+ end_time = time.perf_counter()
1484
+ run_duration = end_time - start_time
1485
+ api_test_case.update_run_duration(run_duration)
1486
+ test_run_manager.update_test_run(api_test_case, test_case)
1487
+ test_results.append(create_test_result(api_test_case))
1488
+ test_results.extend(extract_trace_test_results(trace_api))
1489
+ update_pbar(progress, pbar_id)
1490
+ pbar_case_increments += 1
1491
+
1492
+ # run the golden with a timeout
1493
+ start_time = time.perf_counter()
1494
+ deadline = _per_task_timeout()
1495
+
1496
+ try:
1497
+ run_sync_with_timeout(_run_golden, deadline)
1498
+ except (asyncio.TimeoutError, TimeoutError):
1499
+ # mark any not yet finished trace level and span level metrics as timed out.
1500
+ msg = (
1501
+ f"Timed out after {deadline:.2f}s while executing agentic test case. "
1502
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
1503
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
988
1504
  )
989
1505
 
990
- start_time = time.perf_counter()
1506
+ if current_trace is not None:
1507
+ # Trace-level metrics
1508
+ if getattr(current_trace, "metrics", None):
1509
+ for m in current_trace.metrics:
1510
+ if getattr(m, "skipped", False):
1511
+ continue
1512
+ # if already has a terminal state, leave it alone
1513
+ if getattr(
1514
+ m, "success", None
1515
+ ) is not None or getattr(m, "error", None):
1516
+ continue
1517
+ m.success = False
1518
+ m.error = msg
1519
+
1520
+ # span level metrics, walk the tree
1521
+ def _walk(span):
1522
+ for child in getattr(span, "children", []) or []:
1523
+ _walk(child)
1524
+ for m in list(getattr(span, "metrics", []) or []):
1525
+ if getattr(m, "skipped", False):
1526
+ continue
1527
+ if getattr(
1528
+ m, "success", None
1529
+ ) is not None or getattr(m, "error", None):
1530
+ continue
1531
+ m.success = False
1532
+ m.error = msg
991
1533
 
992
- # Handle trace-level metrics
993
- if current_trace.metrics:
994
- has_task_completion = any(
995
- isinstance(metric, TaskCompletionMetric)
996
- for metric in current_trace.metrics
997
- )
1534
+ for root in getattr(current_trace, "root_spans", []) or []:
1535
+ _walk(root)
998
1536
 
999
- llm_test_case = None
1000
- if current_trace.input:
1001
- llm_test_case = LLMTestCase(
1002
- input=str(current_trace.input),
1003
- actual_output=(
1537
+ # raise if we are not ignoring errors
1538
+ if not error_config.ignore_errors:
1539
+ raise
1540
+
1541
+ finally:
1542
+ try:
1543
+ # Ensure we have an api_test_case to attach results to.
1544
+ if api_test_case is None:
1545
+ # build a minimal test_case
1546
+ if test_case is None:
1547
+ out = (
1004
1548
  str(current_trace.output)
1005
- if current_trace.output is not None
1549
+ if (
1550
+ current_trace is not None
1551
+ and current_trace.output is not None
1552
+ )
1006
1553
  else None
1007
- ),
1008
- expected_output=current_trace.expected_output,
1009
- context=current_trace.context,
1010
- retrieval_context=current_trace.retrieval_context,
1011
- tools_called=current_trace.tools_called,
1012
- expected_tools=current_trace.expected_tools,
1013
- )
1014
- if llm_test_case is None and not has_task_completion:
1015
- raise ValueError(
1016
- "Unable to run metrics on trace without LLMTestCase. Are you sure you called `update_current_trace()`?"
1017
- )
1018
-
1019
- if has_task_completion:
1020
- if llm_test_case is None:
1021
- llm_test_case = LLMTestCase(input="None")
1022
- llm_test_case._trace_dict = (
1023
- trace_manager.create_nested_spans_dict(
1024
- current_trace.root_spans[0]
1025
1554
  )
1026
- )
1555
+ test_case = LLMTestCase(
1556
+ input=golden.input,
1557
+ actual_output=out,
1558
+ expected_output=(
1559
+ current_trace.expected_output
1560
+ if current_trace
1561
+ else None
1562
+ ),
1563
+ context=(
1564
+ current_trace.context
1565
+ if current_trace
1566
+ else None
1567
+ ),
1568
+ retrieval_context=(
1569
+ current_trace.retrieval_context
1570
+ if current_trace
1571
+ else None
1572
+ ),
1573
+ additional_metadata=golden.additional_metadata,
1574
+ tools_called=(
1575
+ current_trace.tools_called
1576
+ if current_trace
1577
+ else None
1578
+ ),
1579
+ expected_tools=(
1580
+ current_trace.expected_tools
1581
+ if current_trace
1582
+ else None
1583
+ ),
1584
+ comments=golden.comments,
1585
+ name=golden.name,
1586
+ _dataset_alias=golden._dataset_alias,
1587
+ _dataset_id=golden._dataset_id,
1588
+ )
1027
1589
 
1028
- for metric in current_trace.metrics:
1029
- metric.skipped = False
1030
- metric.error = None
1031
- if display_config.verbose_mode is not None:
1032
- metric.verbose_mode = display_config.verbose_mode
1590
+ # Create a trace API if we have a trace
1591
+ if trace_api is None and current_trace is not None:
1592
+ trace_api = create_api_trace(current_trace, golden)
1033
1593
 
1034
- trace_api.metrics_data = []
1035
- for metric in current_trace.metrics:
1036
- res = _execute_metric(
1037
- metric=metric,
1038
- test_case=llm_test_case,
1039
- show_metric_indicator=show_metric_indicator,
1040
- in_component=True,
1041
- error_config=error_config,
1594
+ api_test_case = create_api_test_case(
1595
+ test_case=test_case,
1596
+ trace=trace_api,
1597
+ index=count if not _is_assert_test else None,
1042
1598
  )
1043
- if res == "skip":
1044
- continue
1045
1599
 
1046
- if not metric.skipped:
1047
- metric_data = create_metric_data(metric)
1048
- trace_api.metrics_data.append(metric_data)
1049
- api_test_case.update_metric_data(metric_data)
1050
- api_test_case.update_status(metric_data.success)
1051
- update_pbar(progress, pbar_eval_id)
1600
+ if test_run is not None:
1601
+ test_run_manager.set_test_run(test_run)
1602
+
1603
+ if api_test_case.success is None:
1604
+ api_test_case.update_status(False)
1605
+
1606
+ # try to update metric data
1607
+ if current_trace is not None:
1608
+ if current_trace.metrics:
1609
+ for m in current_trace.metrics:
1610
+ if getattr(m, "skipped", False):
1611
+ continue
1612
+ if id(m) in emitted_trace:
1613
+ continue
1614
+ api_test_case.update_metric_data(
1615
+ create_metric_data(m)
1616
+ )
1617
+
1618
+ # Finalize duration and persist
1619
+ elapsed = time.perf_counter() - start_time
1620
+ api_test_case.update_run_duration(
1621
+ elapsed if elapsed >= 0 else deadline
1622
+ )
1623
+
1624
+ if (
1625
+ api_test_case.metrics_data == []
1626
+ and api_test_case.trace is None
1627
+ ):
1628
+ api_test_case.metrics_data = None
1052
1629
 
1053
- # Then handle span-level metrics
1054
- dfs(current_trace.root_spans[0], progress, pbar_eval_id)
1055
- end_time = time.perf_counter()
1056
- run_duration = end_time - start_time
1630
+ test_run_manager.update_test_run(api_test_case, test_case)
1631
+ test_results.append(create_test_result(api_test_case))
1632
+
1633
+ if trace_api is not None:
1634
+ test_results.extend(
1635
+ extract_trace_test_results(trace_api)
1636
+ )
1057
1637
 
1058
- # Update test run
- api_test_case.update_run_duration(run_duration)
- test_run_manager.update_test_run(api_test_case, test_case)
- test_results.append(create_test_result(api_test_case))
- test_results.extend(extract_trace_test_results(trace_api))
+ missing = 2 - pbar_case_increments
+ if missing > 0:
+ update_pbar(progress, pbar_id, advance=missing)
 
- update_pbar(progress, pbar_id)
+ finally:
+ # nothing to clean here, but keep symmetry with other paths
+ pass
 
  if display_config.show_indicator and _use_bar_indicator:
  progress = Progress(
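The synchronous path above runs each golden under a per-task deadline taken from _per_task_timeout() and, on timeout, fails every metric that has not yet reached a terminal state. Neither _per_task_timeout nor run_sync_with_timeout is shown in this diff, so the following is only a minimal sketch of what such helpers could look like, assuming the deadline can be overridden through the DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE environment variable and that the work is waited on in a worker thread; the names with the _sketch suffix are hypothetical.

import concurrent.futures
import os


def _per_task_timeout_sketch(default: float = 600.0) -> float:
    # Hypothetical reader for the override; the real helper lives in
    # deepeval's settings layer and may parse this differently.
    raw = os.getenv("DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE")
    try:
        return float(raw) if raw else default
    except ValueError:
        return default


def run_sync_with_timeout_sketch(fn, deadline: float):
    # Run a blocking callable in a worker thread and stop waiting after
    # `deadline` seconds; the caller catches the timeout and marks metrics.
    # Simplification: the pool still waits for the worker thread on exit,
    # which the real helper may handle differently.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        future = pool.submit(fn)
        return future.result(timeout=deadline)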
@@ -1102,7 +1682,10 @@ async def a_execute_agentic_test_cases(
 
  async def execute_with_semaphore(func: Callable, *args, **kwargs):
  async with semaphore:
- return await func(*args, **kwargs)
+ timeout = _per_task_timeout()
+ return await _await_with_outer_deadline(
+ func, *args, timeout=timeout, **kwargs
+ )
 
  test_run_manager = global_test_run_manager
  test_run_manager.save_to_disk = cache_config.write_cache
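This hunk keeps the semaphore for bounding concurrency and adds an outer deadline per task. _await_with_outer_deadline is not defined in this section, so the sketch below only illustrates the combined pattern under the assumption that the helper behaves roughly like asyncio.wait_for; all names here are illustrative.

import asyncio
from typing import Any, Awaitable, Callable


async def _await_with_outer_deadline_sketch(
    func: Callable[..., Awaitable[Any]], *args, timeout: float, **kwargs
) -> Any:
    # Assumed behaviour: await the wrapped coroutine, but never let it
    # outlive the per-task deadline.
    return await asyncio.wait_for(func(*args, **kwargs), timeout=timeout)


async def bounded(func, *args, semaphore: asyncio.Semaphore, timeout: float, **kwargs):
    # Concurrency is capped by the semaphore; runtime is capped by the deadline.
    async with semaphore:
        return await _await_with_outer_deadline_sketch(
            func, *args, timeout=timeout, **kwargs
        )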
@@ -1149,7 +1732,19 @@ async def a_execute_agentic_test_cases(
  tasks.append(asyncio.create_task(task))
  await asyncio.sleep(async_config.throttle_value)
 
- await asyncio.gather(*tasks)
+ try:
+ await asyncio.wait_for(
+ asyncio.gather(*tasks),
+ timeout=_gather_timeout(),
+ )
+ except (asyncio.TimeoutError, TimeoutError):
+ # Cancel any still-pending tasks and drain them
+ for t in tasks:
+ if not t.done():
+ t.cancel()
+ await asyncio.gather(*tasks, return_exceptions=True)
+ raise
+
  else:
  for golden in goldens:
  with capture_evaluation_run("golden"):
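The gather in the hunk above is now bounded by a batch-level timeout and, on expiry, cancels and drains whatever is still running before re-raising. A self-contained restatement of that pattern, with `deadline` standing in for _gather_timeout(), which is not shown here:

import asyncio
from typing import List


async def gather_with_deadline(tasks: List[asyncio.Task], deadline: float):
    # Bound the whole batch; on timeout cancel the stragglers and drain
    # them with return_exceptions=True so the cancellations are actually
    # delivered before the timeout propagates.
    try:
        await asyncio.wait_for(asyncio.gather(*tasks), timeout=deadline)
    except (asyncio.TimeoutError, TimeoutError):
        for t in tasks:
            if not t.done():
                t.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
        raise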
@@ -1194,93 +1789,89 @@ async def _a_execute_agentic_test_case(
1194
1789
  progress: Optional[Progress] = None,
1195
1790
  pbar_id: Optional[int] = None,
1196
1791
  ):
1197
- if observed_callback:
1198
- total_tags = count_observe_decorators_in_module(observed_callback)
1199
- pbar_tags_id = add_pbar(
1200
- progress,
1201
- f" ⚡ Invoking observed callback (#{count})",
1202
- total=total_tags,
1203
- )
1792
+ test_start_time = time.perf_counter()
1793
+ current_trace = None
1794
+ trace_api = None
1795
+ test_case = None
1796
+ api_test_case = None
1797
+ try:
1798
+ if observed_callback:
1799
+ total_tags = count_observe_decorators_in_module(observed_callback)
1800
+ pbar_tags_id = add_pbar(
1801
+ progress,
1802
+ f" ⚡ Invoking observed callback (#{count})",
1803
+ total=total_tags,
1804
+ )
1204
1805
 
1205
- # Call callback and extract trace
1206
- with Observer(
1207
- "custom",
1208
- func_name="Test Wrapper",
1209
- _progress=progress,
1210
- _pbar_callback_id=pbar_tags_id,
1211
- ):
1212
- if asyncio.iscoroutinefunction(observed_callback):
1213
- await asyncio.wait_for(
1214
- observed_callback(golden.input),
1215
- timeout=_per_task_timeout(),
1216
- )
1217
- else:
1218
- observed_callback(golden.input)
1219
- current_trace: Trace = current_trace_context.get()
1806
+ # Call callback and extract trace
1807
+ with Observer(
1808
+ "custom",
1809
+ func_name="Test Wrapper",
1810
+ _progress=progress,
1811
+ _pbar_callback_id=pbar_tags_id,
1812
+ ):
1813
+ # get current_trace right away, we need it even if cancelled
1814
+ current_trace: Trace = current_trace_context.get()
1815
+ if asyncio.iscoroutinefunction(observed_callback):
1816
+ await _await_with_outer_deadline(
1817
+ observed_callback,
1818
+ golden.input,
1819
+ timeout=_per_task_timeout(),
1820
+ )
1821
+ else:
1822
+ observed_callback(golden.input)
1220
1823
 
1221
- update_pbar(progress, pbar_tags_id, advance=total_tags)
1222
- update_pbar(progress, pbar_id)
1824
+ update_pbar(progress, pbar_tags_id, advance=total_tags)
1825
+ update_pbar(progress, pbar_id)
1223
1826
 
1224
- elif trace:
1225
- current_trace = trace
1827
+ elif trace:
1828
+ current_trace = trace
1226
1829
 
1227
- if trace_metrics:
1228
- current_trace.metrics = trace_metrics
1830
+ trace_level_metrics_count = 0
1229
1831
 
1230
- # run evals through DFS
1231
- trace_api = create_api_trace(trace=current_trace, golden=golden)
1832
+ if trace_metrics:
1833
+ current_trace.metrics = trace_metrics
1232
1834
 
1233
- trace_level_metrics_count = (
1234
- len(current_trace.metrics) if current_trace.metrics else 0
1235
- )
1835
+ # run evals through DFS
1836
+ trace_api = create_api_trace(trace=current_trace, golden=golden)
1236
1837
 
1237
- pbar_eval_id = add_pbar(
1238
- progress,
1239
- f" 🎯 Evaluating component(s) (#{count})",
1240
- total=count_metrics_in_trace(trace=current_trace)
1241
- + trace_level_metrics_count,
1242
- )
1838
+ trace_level_metrics_count = (
1839
+ len(current_trace.metrics) if current_trace.metrics else 0
1840
+ )
1243
1841
 
1244
- test_case = LLMTestCase(
1245
- input=golden.input,
1246
- actual_output=(
1247
- str(current_trace.output)
1248
- if current_trace.output is not None
1249
- else None
1250
- ),
1251
- expected_output=current_trace.expected_output,
1252
- context=current_trace.context,
1253
- retrieval_context=current_trace.retrieval_context,
1254
- tools_called=current_trace.tools_called,
1255
- expected_tools=current_trace.expected_tools,
1256
- additional_metadata=golden.additional_metadata,
1257
- comments=golden.comments,
1258
- name=golden.name,
1259
- _dataset_alias=golden._dataset_alias,
1260
- _dataset_id=golden._dataset_id,
1261
- )
1262
- api_test_case = create_api_test_case(
1263
- test_case=test_case,
1264
- trace=trace_api,
1265
- index=count if not _is_assert_test else None,
1266
- )
1842
+ pbar_eval_id = add_pbar(
1843
+ progress,
1844
+ f" 🎯 Evaluating component(s) (#{count})",
1845
+ total=count_metrics_in_trace(trace=current_trace)
1846
+ + trace_level_metrics_count,
1847
+ )
1267
1848
 
1268
- await _a_execute_trace_test_case(
1269
- trace=trace,
1270
- trace_api=trace_api,
1271
- api_test_case=api_test_case,
1272
- ignore_errors=ignore_errors,
1273
- skip_on_missing_params=skip_on_missing_params,
1274
- show_indicator=show_indicator,
1275
- verbose_mode=verbose_mode,
1276
- progress=progress,
1277
- pbar_eval_id=pbar_eval_id,
1278
- _use_bar_indicator=_use_bar_indicator,
1279
- )
1849
+ test_case = LLMTestCase(
1850
+ input=golden.input,
1851
+ actual_output=(
1852
+ str(current_trace.output)
1853
+ if current_trace.output is not None
1854
+ else None
1855
+ ),
1856
+ expected_output=current_trace.expected_output,
1857
+ context=current_trace.context,
1858
+ retrieval_context=current_trace.retrieval_context,
1859
+ tools_called=current_trace.tools_called,
1860
+ expected_tools=current_trace.expected_tools,
1861
+ additional_metadata=golden.additional_metadata,
1862
+ comments=golden.comments,
1863
+ name=golden.name,
1864
+ _dataset_alias=golden._dataset_alias,
1865
+ _dataset_id=golden._dataset_id,
1866
+ )
1867
+ api_test_case = create_api_test_case(
1868
+ test_case=test_case,
1869
+ trace=trace_api,
1870
+ index=count if not _is_assert_test else None,
1871
+ )
1280
1872
 
1281
- async def dfs(span: BaseSpan):
- await _a_execute_span_test_case(
- span=span,
+ await _a_execute_trace_test_case(
+ trace=current_trace,
  trace_api=trace_api,
  api_test_case=api_test_case,
  ignore_errors=ignore_errors,
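In the hunk above the active trace is now read from current_trace_context before the observed callback runs, so it is available even if the callback is later cancelled. The exact context implementation is not part of this diff; the sketch below only illustrates the kind of contextvars-based design that call sites like current_trace_context.get() suggest, and every name in it is illustrative.

from contextvars import ContextVar
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class TraceSketch:
    # Minimal stand-in for the real Trace object.
    input: Optional[str] = None
    output: Optional[str] = None
    root_spans: List[object] = field(default_factory=list)


current_trace_sketch: ContextVar[Optional[TraceSketch]] = ContextVar(
    "current_trace_sketch", default=None
)


def get_current_trace() -> TraceSketch:
    # Each asyncio task sees its own value, which is why the trace can be
    # fetched up front and still be correct per golden.
    trace = current_trace_sketch.get()
    if trace is None:
        trace = TraceSketch()
        current_trace_sketch.set(trace)
    return trace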
@@ -1289,39 +1880,155 @@ async def _a_execute_agentic_test_case(
1289
1880
  verbose_mode=verbose_mode,
1290
1881
  progress=progress,
1291
1882
  pbar_eval_id=pbar_eval_id,
1292
- test_run_manager=test_run_manager,
1293
1883
  _use_bar_indicator=_use_bar_indicator,
1294
1884
  )
1295
- child_tasks = [dfs(child) for child in span.children]
1296
- if child_tasks:
1297
- await asyncio.gather(*child_tasks)
1298
1885
 
1299
- test_start_time = time.perf_counter()
1300
- if current_trace and current_trace.root_spans:
1301
- await dfs(current_trace.root_spans[0])
1302
- else:
1303
- if (
1304
- logger.isEnabledFor(logging.DEBUG)
1305
- and get_settings().DEEPEVAL_VERBOSE_MODE
1306
- ):
1307
- logger.debug(
1308
- "Skipping DFS: empty trace or no root spans (trace=%s)",
1309
- current_trace.uuid if current_trace else None,
1886
+ async def dfs(trace: Trace, span: BaseSpan):
1887
+ await _a_execute_span_test_case(
1888
+ span=span,
1889
+ current_trace=trace,
1890
+ trace_api=trace_api,
1891
+ api_test_case=api_test_case,
1892
+ ignore_errors=ignore_errors,
1893
+ skip_on_missing_params=skip_on_missing_params,
1894
+ show_indicator=show_indicator,
1895
+ verbose_mode=verbose_mode,
1896
+ progress=progress,
1897
+ pbar_eval_id=pbar_eval_id,
1898
+ test_run_manager=test_run_manager,
1899
+ _use_bar_indicator=_use_bar_indicator,
1310
1900
  )
1311
1901
 
1312
- test_end_time = time.perf_counter()
1313
- run_duration = test_end_time - test_start_time
1902
+ if _skip_metrics_for_error(span=span, trace=trace):
1903
+ return
1904
+
1905
+ child_tasks = [
1906
+ asyncio.create_task(dfs(trace, child))
1907
+ for child in span.children
1908
+ ]
1909
+ if child_tasks:
1910
+ try:
1911
+ await asyncio.wait_for(
1912
+ asyncio.gather(*child_tasks),
1913
+ timeout=_gather_timeout(),
1914
+ )
1915
+ except (asyncio.TimeoutError, TimeoutError):
1916
+ for t in child_tasks:
1917
+ if not t.done():
1918
+ t.cancel()
1919
+ await asyncio.gather(*child_tasks, return_exceptions=True)
1920
+ raise
1921
+
1922
+ if not _skip_metrics_for_error(trace=current_trace):
1923
+ if current_trace and current_trace.root_spans:
1924
+ await dfs(current_trace, current_trace.root_spans[0])
1925
+ else:
1926
+ if (
1927
+ logger.isEnabledFor(logging.DEBUG)
1928
+ and get_settings().DEEPEVAL_VERBOSE_MODE
1929
+ ):
1930
+ logger.debug(
1931
+ "Skipping DFS: empty trace or no root spans (trace=%s)",
1932
+ current_trace.uuid if current_trace else None,
1933
+ )
1934
+ except asyncio.CancelledError:
1935
+ # mark any unfinished metrics as cancelled
1936
+ cancel_msg = (
1937
+ "Timed out/cancelled while evaluating agentic test case. "
1938
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
1939
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
1940
+ )
1941
+
1942
+ if trace_metrics:
1943
+ for m in trace_metrics:
1944
+ if getattr(m, "skipped", False):
1945
+ continue
1946
+ if getattr(m, "success", None) is None and not getattr(
1947
+ m, "error", None
1948
+ ):
1949
+ m.success = False
1950
+ m.error = cancel_msg
1951
+
1952
+ if trace is not None and trace.metrics:
1953
+ for m in trace.metrics:
1954
+ if getattr(m, "skipped", False):
1955
+ continue
1956
+ if getattr(m, "success", None) is None and not getattr(
1957
+ m, "error", None
1958
+ ):
1959
+ m.success = False
1960
+ m.error = cancel_msg
1961
+ if not ignore_errors:
1962
+ raise
1963
+ finally:
1964
+ try:
1965
+ if api_test_case is None:
1966
+ if test_case is None:
1967
+ test_case = LLMTestCase(
1968
+ input=golden.input,
1969
+ actual_output=None,
1970
+ expected_output=None,
1971
+ context=None,
1972
+ retrieval_context=None,
1973
+ additional_metadata=golden.additional_metadata,
1974
+ tools_called=None,
1975
+ expected_tools=None,
1976
+ comments=golden.comments,
1977
+ name=golden.name,
1978
+ _dataset_alias=golden._dataset_alias,
1979
+ _dataset_id=golden._dataset_id,
1980
+ )
1981
+ if trace is not None and trace_api is None:
1982
+ trace_api = create_api_trace(trace, golden)
1314
1983
 
1315
- api_test_case.update_run_duration(run_duration)
1316
- test_run_manager.update_test_run(api_test_case, test_case)
1317
- test_results.append(create_test_result(api_test_case))
1318
- test_results.extend(extract_trace_test_results(trace_api))
1984
+ api_test_case = create_api_test_case(
1985
+ test_case=test_case,
1986
+ trace=trace_api,
1987
+ index=(count if not _is_assert_test else None),
1988
+ )
1319
1989
 
1320
- update_pbar(progress, pbar_id)
1990
+ # attach MetricData for any trace metrics we marked above
1991
+ if trace_metrics:
1992
+ for m in trace_metrics:
1993
+ if getattr(m, "skipped", False):
1994
+ continue
1995
+ api_test_case.update_metric_data(create_metric_data(m))
1996
+
1997
+ # If nothing set success yet, mark the case failed
1998
+ if api_test_case.success is None:
1999
+ api_test_case.update_status(False)
2000
+
2001
+ # test_run_manager.update_test_run returns early if api_test_case.metrics_data is an empty list.
2002
+ # Set it to None to ensure the test_case is added
2003
+ if api_test_case.metrics_data == [] and api_test_case.trace is None:
2004
+ api_test_case.metrics_data = None
2005
+
2006
+ # Duration & persist
2007
+ test_end_time = time.perf_counter()
2008
+ run_duration = test_end_time - test_start_time
2009
+ api_test_case.update_run_duration(run_duration)
2010
+ test_run_manager.update_test_run(api_test_case, test_case)
2011
+
2012
+ # Build results and de-duplicate against trace results
2013
+ main_result = create_test_result(api_test_case)
2014
+ trace_results = (
2015
+ extract_trace_test_results(trace_api)
2016
+ if trace_api is not None
2017
+ else []
2018
+ )
2019
+ unique_trace_results = filter_duplicate_results(
2020
+ main_result, trace_results
2021
+ )
2022
+ test_results.append(main_result)
2023
+ test_results.extend(unique_trace_results)
2024
+ update_pbar(progress, pbar_id)
2025
+ finally:
2026
+ pass
1321
2027
 
1322
2028
 
1323
2029
  async def _a_execute_span_test_case(
  span: BaseSpan,
+ current_trace: Trace,
  trace_api: TraceApi,
  api_test_case: LLMApiTestCase,
  ignore_errors: bool,
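The CancelledError handler in the hunk above applies one consistent rule to metrics that never finished: skipped metrics are left alone, metrics that already reached a terminal state keep it, and everything else is failed with the timeout/cancellation message. A compact restatement of that rule as a standalone helper (the helper itself is not part of the diff):

def mark_unfinished_metrics(metrics, message: str) -> None:
    # Mirror of the marking logic used for both trace- and span-level metrics.
    for m in metrics or []:
        if getattr(m, "skipped", False):
            continue
        if getattr(m, "success", None) is not None or getattr(m, "error", None):
            continue
        m.success = False
        m.error = message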
@@ -1346,12 +2053,22 @@ async def _a_execute_span_test_case(
1346
2053
  else:
1347
2054
  trace_api.base_spans.append(api_span)
1348
2055
 
1349
- if span.metrics is None:
2056
+ if _skip_metrics_for_error(span=span, trace=current_trace):
2057
+ api_span.status = TraceSpanApiStatus.ERRORED
2058
+ api_span.error = span.error or _trace_error(current_trace)
2059
+ if progress and pbar_eval_id is not None:
2060
+ update_pbar(
2061
+ progress,
2062
+ pbar_eval_id,
2063
+ advance=count_metrics_in_span_subtree(span),
2064
+ )
2065
+ return
2066
+
2067
+ metrics: List[BaseMetric] = list(span.metrics or [])
2068
+ if not metrics:
1350
2069
  return
1351
2070
 
1352
- has_task_completion = any(
1353
- isinstance(metric, TaskCompletionMetric) for metric in span.metrics
1354
- )
2071
+ requires_trace = any(metric.requires_trace for metric in metrics)
 
  llm_test_case = None
  if span.input:
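When a span subtree is skipped because of an upstream error, the hunk above advances the progress bar by count_metrics_in_span_subtree(span) in one step. That helper is referenced but not defined in this section; a plausible sketch, assuming spans expose `metrics` and `children` as seen elsewhere in the diff:

def count_metrics_in_span_subtree_sketch(span) -> int:
    # Assumed shape: metrics on this span plus all of its descendants, so
    # the evaluation progress bar can be advanced in a single call.
    total = len(span.metrics or [])
    for child in span.children or []:
        total += count_metrics_in_span_subtree_sketch(child)
    return total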
@@ -1364,17 +2081,29 @@ async def _a_execute_span_test_case(
1364
2081
  tools_called=span.tools_called,
1365
2082
  expected_tools=span.expected_tools,
1366
2083
  )
1367
- if llm_test_case is None and not has_task_completion:
1368
- raise ValueError(
1369
- "Unable to run metrics on span without LLMTestCase. Are you sure you called `update_current_span()`?"
1370
- )
2084
+
2085
+ if not requires_trace:
2086
+ if llm_test_case is None:
2087
+ api_span.status = TraceSpanApiStatus.ERRORED
2088
+ api_span.error = format_error_text(
2089
+ DeepEvalError(
2090
+ "Span has metrics but no LLMTestCase. "
2091
+ "Are you sure you called `update_current_span()`?"
2092
+ )
2093
+ )
2094
+ if progress and pbar_eval_id is not None:
2095
+ update_pbar(
2096
+ progress,
2097
+ pbar_eval_id,
2098
+ advance=count_metrics_in_span_subtree(span),
2099
+ )
2100
+ return
1371
2101
 
1372
2102
  show_metrics_indicator = show_indicator and not _use_bar_indicator
1373
- metrics: List[BaseMetric] = span.metrics
1374
2103
  test_case: Optional[LLMTestCase] = llm_test_case
1375
2104
 
1376
2105
  # add trace if task completion
1377
- if has_task_completion:
+ if requires_trace:
  if test_case is None:
  test_case = LLMTestCase(input="None")
  test_case._trace_dict = trace_manager.create_nested_spans_dict(span)
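The hunk above replaces the isinstance(TaskCompletionMetric) check with a `requires_trace` flag on the metric, so any metric that needs the full span tree can opt in without special-casing its class. A minimal sketch of how such a flag might be consumed; the class name below is purely illustrative and not part of deepeval's API:

class TraceAwareMetricSketch:
    # Illustrative only: a metric advertises that it needs the nested span
    # dictionary attached to the test case before measurement.
    requires_trace: bool = True


def needs_trace(metrics) -> bool:
    return any(getattr(m, "requires_trace", False) for m in metrics)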
@@ -1418,12 +2147,22 @@ async def _a_execute_trace_test_case(
1418
2147
  pbar_eval_id: Optional[int],
1419
2148
  _use_bar_indicator: bool,
1420
2149
  ):
1421
- if trace.metrics is None:
2150
+
2151
+ if _skip_metrics_for_error(trace=trace):
2152
+ trace_api.status = TraceSpanApiStatus.ERRORED
2153
+ if progress and pbar_eval_id is not None:
2154
+ update_pbar(
2155
+ progress,
2156
+ pbar_eval_id,
2157
+ advance=count_total_metrics_for_trace(trace),
2158
+ )
1422
2159
  return
1423
2160
 
1424
- has_task_completion = any(
1425
- isinstance(metric, TaskCompletionMetric) for metric in trace.metrics
1426
- )
2161
+ metrics: List[BaseMetric] = list(trace.metrics or [])
2162
+ if not metrics:
2163
+ return
2164
+
2165
+ requires_trace = any(metric.requires_trace for metric in metrics)
1427
2166
 
1428
2167
  llm_test_case = None
1429
2168
  if trace.input:
@@ -1438,17 +2177,32 @@ async def _a_execute_trace_test_case(
1438
2177
  tools_called=trace.tools_called,
1439
2178
  expected_tools=trace.expected_tools,
1440
2179
  )
1441
- if llm_test_case is None and not has_task_completion:
1442
- raise ValueError(
1443
- "Unable to run metrics on trace without LLMTestCase. Are you sure you called `update_current_trace()`?"
1444
- )
2180
+
2181
+ if not requires_trace:
2182
+ if llm_test_case is None:
2183
+ trace.status = TraceSpanStatus.ERRORED
2184
+ trace_api.status = TraceSpanApiStatus.ERRORED
2185
+ if trace.root_spans:
2186
+ trace.root_spans[0].status = TraceSpanStatus.ERRORED
2187
+ trace.root_spans[0].error = format_error_text(
2188
+ DeepEvalError(
2189
+ "Trace has metrics but no LLMTestCase (missing input/output). "
2190
+ "Are you sure you called `update_current_trace()`?"
2191
+ )
2192
+ )
2193
+ if progress and pbar_eval_id is not None:
2194
+ update_pbar(
2195
+ progress,
2196
+ pbar_eval_id,
2197
+ advance=count_total_metrics_for_trace(trace),
2198
+ )
2199
+ return
1445
2200
 
1446
2201
  show_metrics_indicator = show_indicator and not _use_bar_indicator
1447
- metrics: List[BaseMetric] = trace.metrics
1448
2202
  test_case: Optional[LLMTestCase] = llm_test_case
1449
2203
 
1450
2204
  # add trace if task completion
1451
- if has_task_completion:
2205
+ if requires_trace:
1452
2206
  if test_case is None:
1453
2207
  test_case = LLMTestCase(input="None")
1454
2208
  test_case._trace_dict = trace_manager.create_nested_spans_dict(
@@ -1578,11 +2332,12 @@ def execute_agentic_test_cases_from_loop(
1578
2332
  pbar_eval_id: Optional[int] = None,
1579
2333
  ):
1580
2334
  # Create API Span
1581
- metrics: List[BaseMetric] = span.metrics
2335
+ metrics: List[BaseMetric] = list(span.metrics or [])
1582
2336
 
1583
2337
  api_span: BaseApiSpan = (
1584
2338
  trace_manager._convert_span_to_api_span(span)
1585
2339
  )
2340
+
1586
2341
  if isinstance(span, AgentSpan):
1587
2342
  trace_api.agent_spans.append(api_span)
1588
2343
  elif isinstance(span, LlmSpan):
@@ -1595,9 +2350,30 @@ def execute_agentic_test_cases_from_loop(
1595
2350
  else:
1596
2351
  trace_api.base_spans.append(api_span)
1597
2352
 
2353
+ # Skip errored trace/span
2354
+ if _skip_metrics_for_error(span=span, trace=current_trace):
2355
+ api_span.status = TraceSpanApiStatus.ERRORED
2356
+ api_span.error = span.error or _trace_error(
2357
+ current_trace
2358
+ )
2359
+ if progress and pbar_eval_id is not None:
2360
+ update_pbar(
2361
+ progress,
2362
+ pbar_eval_id,
2363
+ advance=count_metrics_in_span_subtree(span),
2364
+ )
2365
+ return
2366
+
1598
2367
  for child in span.children:
1599
2368
  dfs(child, progress, pbar_eval_id)
1600
2369
 
2370
+ if not span.metrics:
2371
+ return
2372
+
2373
+ requires_trace = any(
2374
+ metric.requires_trace for metric in metrics
2375
+ )
2376
+
1601
2377
  llm_test_case = None
1602
2378
  if span.input is not None:
1603
2379
  llm_test_case = LLMTestCase(
@@ -1613,20 +2389,29 @@ def execute_agentic_test_cases_from_loop(
1613
2389
  tools_called=span.tools_called,
1614
2390
  expected_tools=span.expected_tools,
1615
2391
  )
1616
- if span.metrics is None or llm_test_case is None:
1617
- return
1618
2392
 
1619
- has_task_completion = any(
1620
- isinstance(metric, TaskCompletionMetric)
1621
- for metric in metrics
1622
- )
1623
-
1624
- if has_task_completion:
2393
+ if requires_trace:
1625
2394
  if llm_test_case is None:
1626
2395
  llm_test_case = LLMTestCase(input="None")
1627
2396
  llm_test_case._trace_dict = (
1628
2397
  trace_manager.create_nested_spans_dict(span)
1629
2398
  )
2399
+ else:
2400
+ if llm_test_case is None:
2401
+ api_span.status = TraceSpanApiStatus.ERRORED
2402
+ api_span.error = format_error_text(
2403
+ DeepEvalError(
2404
+ "Span has metrics but no LLMTestCase. "
2405
+ "Are you sure you called `update_current_span()`?"
2406
+ )
2407
+ )
2408
+ if progress and pbar_eval_id is not None:
2409
+ update_pbar(
2410
+ progress,
2411
+ pbar_eval_id,
2412
+ advance=count_metrics_in_span_subtree(span),
2413
+ )
2414
+ return
1630
2415
 
1631
2416
  # Preparing metric calculation
1632
2417
  api_span.metrics_data = []
@@ -1670,77 +2455,123 @@ def execute_agentic_test_cases_from_loop(
1670
2455
  start_time = time.perf_counter()
1671
2456
 
1672
2457
  # Handle trace-level metrics
1673
- if current_trace.metrics:
1674
- has_task_completion = any(
1675
- isinstance(metric, TaskCompletionMetric)
1676
- for metric in current_trace.metrics
1677
- )
1678
-
1679
- llm_test_case = None
1680
- if current_trace.input:
1681
- llm_test_case = LLMTestCase(
1682
- input=str(current_trace.input),
1683
- actual_output=(
1684
- str(current_trace.output)
1685
- if current_trace.output is not None
1686
- else None
2458
+ skip_metrics_for_this_golden = False
2459
+ if _skip_metrics_for_error(trace=current_trace):
2460
+ trace_api.status = TraceSpanApiStatus.ERRORED
2461
+ if progress and pbar_eval_id is not None:
2462
+ update_pbar(
2463
+ progress,
2464
+ pbar_eval_id,
2465
+ advance=count_total_metrics_for_trace(
2466
+ current_trace
1687
2467
  ),
1688
- expected_output=current_trace.expected_output,
1689
- context=current_trace.context,
1690
- retrieval_context=current_trace.retrieval_context,
1691
- tools_called=current_trace.tools_called,
1692
- expected_tools=current_trace.expected_tools,
1693
2468
  )
1694
- if llm_test_case is None and not has_task_completion:
1695
- raise ValueError(
1696
- "Unable to run metrics on trace without LLMTestCase. Are you sure you called `update_current_trace()`?"
2469
+ else:
2470
+ if current_trace.metrics:
2471
+ requires_trace = any(
2472
+ metric.requires_trace
2473
+ for metric in current_trace.metrics
1697
2474
  )
1698
2475
 
1699
- if has_task_completion:
1700
- if llm_test_case is None:
1701
- llm_test_case = LLMTestCase(input="None")
1702
- llm_test_case._trace_dict = (
1703
- trace_manager.create_nested_spans_dict(
1704
- current_trace.root_spans[0]
2476
+ llm_test_case = None
2477
+ if current_trace.input:
2478
+ llm_test_case = LLMTestCase(
2479
+ input=str(current_trace.input),
2480
+ actual_output=(
2481
+ str(current_trace.output)
2482
+ if current_trace.output is not None
2483
+ else None
2484
+ ),
2485
+ expected_output=current_trace.expected_output,
2486
+ context=current_trace.context,
2487
+ retrieval_context=current_trace.retrieval_context,
2488
+ tools_called=current_trace.tools_called,
2489
+ expected_tools=current_trace.expected_tools,
1705
2490
  )
1706
- )
1707
-
1708
- for metric in current_trace.metrics:
1709
- metric.skipped = False
1710
- metric.error = None
1711
- if display_config.verbose_mode is not None:
1712
- metric.verbose_mode = display_config.verbose_mode
1713
-
1714
- trace_api.metrics_data = []
1715
- for metric in current_trace.metrics:
1716
- res = _execute_metric(
1717
- metric=metric,
1718
- test_case=llm_test_case,
1719
- show_metric_indicator=show_metric_indicator,
1720
- in_component=True,
1721
- error_config=error_config,
1722
- )
1723
- if res == "skip":
1724
- continue
1725
-
1726
- if not metric.skipped:
1727
- metric_data = create_metric_data(metric)
1728
- trace_api.metrics_data.append(metric_data)
1729
- api_test_case.update_metric_data(metric_data)
1730
- api_test_case.update_status(metric_data.success)
1731
- update_pbar(progress, pbar_eval_id)
1732
-
1733
- # Then handle span-level metrics
1734
- dfs(current_trace.root_spans[0], progress, pbar_eval_id)
1735
- end_time = time.perf_counter()
1736
- run_duration = end_time - start_time
1737
2491
 
1738
- # Update test run
1739
- api_test_case.update_run_duration(run_duration)
1740
- test_run_manager.update_test_run(api_test_case, test_case)
1741
- test_results.append(create_test_result(api_test_case))
2492
+ if requires_trace:
2493
+ if llm_test_case is None:
2494
+ llm_test_case = LLMTestCase(input="None")
2495
+ llm_test_case._trace_dict = (
2496
+ trace_manager.create_nested_spans_dict(
2497
+ current_trace.root_spans[0]
2498
+ )
2499
+ )
2500
+ else:
2501
+ if llm_test_case is None:
2502
+ current_trace.status = TraceSpanStatus.ERRORED
2503
+ trace_api.status = TraceSpanApiStatus.ERRORED
2504
+ if current_trace.root_spans:
2505
+ current_trace.root_spans[0].status = (
2506
+ TraceSpanStatus.ERRORED
2507
+ )
2508
+ current_trace.root_spans[0].error = (
2509
+ format_error_text(
2510
+ DeepEvalError(
2511
+ "Trace has metrics but no LLMTestCase (missing input/output). "
2512
+ "Are you sure you called `update_current_trace()`?"
2513
+ )
2514
+ )
2515
+ )
2516
+ if progress and pbar_eval_id is not None:
2517
+ update_pbar(
2518
+ progress,
2519
+ pbar_eval_id,
2520
+ advance=count_total_metrics_for_trace(
2521
+ current_trace
2522
+ ),
2523
+ )
2524
+ skip_metrics_for_this_golden = True
2525
+
2526
+ if not skip_metrics_for_this_golden:
2527
+ for metric in current_trace.metrics:
2528
+ metric.skipped = False
2529
+ metric.error = None
2530
+ if display_config.verbose_mode is not None:
2531
+ metric.verbose_mode = (
2532
+ display_config.verbose_mode
2533
+ )
2534
+
2535
+ trace_api.metrics_data = []
2536
+ for metric in current_trace.metrics:
2537
+ res = _execute_metric(
2538
+ metric=metric,
2539
+ test_case=llm_test_case,
2540
+ show_metric_indicator=show_metric_indicator,
2541
+ in_component=True,
2542
+ error_config=error_config,
2543
+ )
2544
+ if res == "skip":
2545
+ continue
2546
+
2547
+ if not metric.skipped:
2548
+ metric_data = create_metric_data(metric)
2549
+ trace_api.metrics_data.append(metric_data)
2550
+ api_test_case.update_metric_data(
2551
+ metric_data
2552
+ )
2553
+ api_test_case.update_status(
2554
+ metric_data.success
2555
+ )
2556
+ update_pbar(progress, pbar_eval_id)
2557
+
2558
+ # Then handle span-level metrics
2559
+ dfs(current_trace.root_spans[0], progress, pbar_eval_id)
2560
+
2561
+ end_time = time.perf_counter()
2562
+ run_duration = end_time - start_time
2563
+ # Update test run
2564
+ api_test_case.update_run_duration(run_duration)
2565
+ test_run_manager.update_test_run(api_test_case, test_case)
2566
+ main_result = create_test_result(api_test_case)
2567
+ trace_results = extract_trace_test_results(trace_api)
2568
+ unique_trace_results = filter_duplicate_results(
2569
+ main_result, trace_results
2570
+ )
2571
+ test_results.append(main_result)
2572
+ test_results.extend(unique_trace_results)
1742
2573
 
1743
- update_pbar(progress, pbar_id)
+ update_pbar(progress, pbar_id)
 
  try:
  if display_config.show_indicator and _use_bar_indicator:
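Results extracted from the trace are now passed through filter_duplicate_results before being appended, so the trace-derived entries do not repeat the main test result. The helper's de-duplication key is not visible in this diff; the sketch below is one plausible reading and the key used here (name plus input) is an assumption:

def filter_duplicate_results_sketch(main_result, trace_results):
    # Hypothetical de-duplication: drop trace-level results that describe
    # the same test case as the main result.
    def _key(result):
        return (getattr(result, "name", None), getattr(result, "input", None))

    main_key = _key(main_result)
    return [r for r in trace_results if _key(r) != main_key]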
@@ -1798,9 +2629,8 @@ def a_execute_agentic_test_cases_from_loop(
 
  async def execute_callback_with_semaphore(coroutine: Awaitable):
  async with semaphore:
- return await asyncio.wait_for(
- coroutine, timeout=_per_task_timeout()
- )
+ timeout = _per_task_timeout()
+ return await _await_with_outer_deadline(coroutine, timeout=timeout)
 
  def evaluate_test_cases(
  progress: Optional[Progress] = None,
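Note that this call site hands _await_with_outer_deadline an already-created coroutine object, while earlier hunks pass a coroutine function plus arguments. Assuming the helper supports both shapes, a sketch of that contract (the implementation is not shown in this diff and may differ):

import asyncio
import inspect


async def _await_with_outer_deadline_sketch(target, *args, timeout, **kwargs):
    # Accept either a coroutine object (this hunk) or a coroutine function
    # plus arguments (the semaphore wrappers above), and bound the await
    # either way.
    awaitable = target if inspect.iscoroutine(target) else target(*args, **kwargs)
    return await asyncio.wait_for(awaitable, timeout=timeout)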
@@ -1841,39 +2671,146 @@ def a_execute_agentic_test_cases_from_loop(
1841
2671
  }
1842
2672
 
1843
2673
  def on_task_done(t: asyncio.Task):
2674
+ cancelled = False
2675
+ exc = None
2676
+ trace = None
2677
+ root = None
2678
+ resolved_trace_from_task = False
2679
+ resolved_root_from_task = False
2680
+
2681
+ # Task.exception() raises CancelledError if task was cancelled
2682
+ try:
2683
+ exc = t.exception()
2684
+ except asyncio.CancelledError:
2685
+ cancelled = True
2686
+ exc = None
2687
+
2688
+ meta = task_meta.get(t, {})
2689
+ golden_index = meta.get("golden_index")
2690
+
2691
+ if golden_index is not None and 0 <= golden_index < len(
2692
+ goldens
2693
+ ):
2694
+ golden = goldens[golden_index]
2695
+
2696
+ def _mark_trace_error(trace, root, msg: str):
2697
+ now = time.perf_counter()
2698
+ trace.status = TraceSpanStatus.ERRORED
2699
+ # Close the trace so the API layer has a proper endTime
2700
+ if trace.end_time is None:
2701
+ trace.end_time = now
2702
+ if root:
2703
+ root.status = TraceSpanStatus.ERRORED
2704
+ root.error = msg
2705
+ if root.end_time is None:
2706
+ root.end_time = now
2707
+
2708
+ if exc is not None:
2709
+ msg = format_error_text(exc)
2710
+ trace, root = _resolve_trace_and_root_for_task(t)
2711
+ resolved_trace_from_task = bool(trace)
2712
+ resolved_root_from_task = bool(root)
2713
+ if trace:
2714
+ _mark_trace_error(trace, root, msg)
2715
+ else:
2716
+ for (
2717
+ trace
2718
+ ) in trace_manager.integration_traces_to_evaluate:
2719
+ if (
2720
+ trace_manager.trace_uuid_to_golden.get(
2721
+ trace.uuid
2722
+ )
2723
+ is golden
2724
+ ):
2725
+ root = _pick_root_for_marking(trace)
2726
+ _mark_trace_error(trace, root, msg)
2727
+ break
2728
+
2729
+ elif cancelled or t.cancelled():
2730
+ cancel_exc = DeepEvalError(
2731
+ "Task was cancelled (likely due to timeout)."
2732
+ )
2733
+ msg = format_error_text(cancel_exc)
2734
+ trace, root = _resolve_trace_and_root_for_task(t)
2735
+ resolved_trace_from_task = bool(trace)
2736
+ resolved_root_from_task = bool(root)
2737
+ if trace:
2738
+ _mark_trace_error(trace, root, msg)
2739
+ else:
2740
+ for (
2741
+ trace
2742
+ ) in trace_manager.integration_traces_to_evaluate:
2743
+ if (
2744
+ trace_manager.trace_uuid_to_golden.get(
2745
+ trace.uuid
2746
+ )
2747
+ is golden
2748
+ ):
2749
+ root = _pick_root_for_marking(trace)
2750
+ _mark_trace_error(trace, root, msg)
2751
+ break
2752
+
1844
2753
  if get_settings().DEEPEVAL_DEBUG_ASYNC:
1845
2754
  # Using info level here to make it easy to spot these logs.
1846
- # We are gated by DEEPEVAL_DEBUG_ASYNC
1847
- meta = task_meta.get(t, {})
2755
+ golden_name = meta.get("golden_name")
1848
2756
  duration = time.perf_counter() - meta.get(
1849
2757
  "started", started
1850
2758
  )
1851
2759
 
1852
- if t.cancelled():
2760
+ if cancelled or exc is not None:
2761
+ if not resolved_trace_from_task:
2762
+ logger.warning(
2763
+ "[deepeval] on_task_done: no binding for task; falling back to golden->trace. task=%s golden=%r",
2764
+ t.get_name(),
2765
+ golden_name,
2766
+ )
2767
+ elif not resolved_root_from_task:
2768
+ logger.warning(
2769
+ "[deepeval] on_task_done: bound trace found but no bound root; using heuristic. task=%s trace=%s",
2770
+ t.get_name(),
2771
+ trace.uuid,
2772
+ )
2773
+
2774
+ if cancelled:
1853
2775
  logger.info(
1854
2776
  "[deepeval] task CANCELLED %s after %.2fs meta=%r",
1855
2777
  t.get_name(),
1856
2778
  duration,
1857
2779
  meta,
1858
2780
  )
1859
- else:
1860
- exc = t.exception()
1861
- if exc is not None:
1862
- logger.error(
1863
- "[deepeval] task ERROR %s after %.2fs meta=%r",
1864
- t.get_name(),
1865
- duration,
1866
- meta,
1867
- exc_info=(type(exc), exc, exc.__traceback__),
1868
- )
1869
- else:
1870
- logger.info(
1871
- "[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
1872
- t.get_name(),
1873
- duration,
1874
- meta.get("golden_index"),
2781
+ elif exc is not None:
2782
+
2783
+ show_trace = bool(
2784
+ get_settings().DEEPEVAL_LOG_STACK_TRACES
2785
+ )
2786
+ exc_info = (
2787
+ (
2788
+ type(exc),
2789
+ exc,
2790
+ getattr(exc, "__traceback__", None),
1875
2791
  )
2792
+ if show_trace
2793
+ else None
2794
+ )
2795
+ logger.error(
2796
+ "[deepeval] task ERROR %s after %.2fs meta=%r",
2797
+ t.get_name(),
2798
+ duration,
2799
+ meta,
2800
+ exc_info=exc_info,
2801
+ )
2802
+ else:
2803
+ logger.info(
2804
+ "[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
2805
+ t.get_name(),
2806
+ duration,
2807
+ meta.get("golden_index"),
2808
+ )
1876
2809
 
2810
+ try:
+ trace_manager.task_bindings.pop(t, None)
+ except Exception:
+ pass
  update_pbar(progress, pbar_callback_id)
  update_pbar(progress, pbar_id)
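The reworked on_task_done above reads the task outcome before any logging, because asyncio.Task.exception() itself raises CancelledError for a cancelled task. A condensed illustration of that inspection order, for a task that is already done (as it always is inside a done-callback):

import asyncio


def inspect_done_task(t: asyncio.Task):
    # Cancellation must be handled before the exception is read, otherwise
    # the done-callback itself blows up with CancelledError.
    try:
        exc = t.exception()
    except asyncio.CancelledError:
        return "cancelled", None
    if exc is not None:
        return "error", exc
    return "ok", None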
1879
2816
 
@@ -1918,7 +2855,8 @@ def a_execute_agentic_test_cases_from_loop(
  timeout=_gather_timeout(),
  )
  )
- except asyncio.TimeoutError:
+
+ except (asyncio.TimeoutError, TimeoutError):
  import traceback
 
  pending = [t for t in created_tasks if not t.done()]
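The broadened except clause above catches both asyncio.TimeoutError and the builtin TimeoutError. On Python 3.11+ the two are aliases, but on older interpreters they are distinct classes, so the tuple form keeps the timeout handling uniform across versions. A small illustration:

import asyncio
import sys


async def demo_timeout_catch():
    # Works the same on 3.10 and 3.11+ thanks to catching both names.
    try:
        await asyncio.wait_for(asyncio.sleep(10), timeout=0.01)
    except (asyncio.TimeoutError, TimeoutError):
        return f"timed out on Python {sys.version_info.major}.{sys.version_info.minor}"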
@@ -1987,10 +2925,11 @@ def a_execute_agentic_test_cases_from_loop(
  ]
 
  if get_settings().DEEPEVAL_DEBUG_ASYNC:
- logger.warning(
- "[deepeval] %d stray task(s) not tracked; cancelling...",
- len(leftovers),
- )
+ if len(leftovers) > 0:
+ logger.warning(
+ "[deepeval] %d stray task(s) not tracked; cancelling...",
+ len(leftovers),
+ )
  for t in leftovers:
  meta = task_meta.get(t, {})
  name = t.get_name()
@@ -2130,7 +3069,10 @@ async def _a_evaluate_traces(
 
  async def execute_evals_with_semaphore(func: Callable, *args, **kwargs):
  async with semaphore:
- return await func(*args, **kwargs)
+ timeout = _per_task_timeout()
+ return await _await_with_outer_deadline(
+ func, *args, timeout=timeout, **kwargs
+ )
 
  eval_tasks = []
  # Here, we will work off a fixed-set copy to avoid surprises from potential
@@ -2173,7 +3115,18 @@ async def _a_evaluate_traces(
  )
  eval_tasks.append(asyncio.create_task(task))
  await asyncio.sleep(throttle_value)
- await asyncio.gather(*eval_tasks)
+
+ try:
+ await asyncio.wait_for(
+ asyncio.gather(*eval_tasks),
+ timeout=_gather_timeout(),
+ )
+ except (asyncio.TimeoutError, TimeoutError):
+ for t in eval_tasks:
+ if not t.done():
+ t.cancel()
+ await asyncio.gather(*eval_tasks, return_exceptions=True)
+ raise
 
 
  async def _evaluate_test_case_pairs(
@@ -2196,7 +3149,10 @@ async def _evaluate_test_case_pairs(
 
  async def execute_with_semaphore(func: Callable, *args, **kwargs):
  async with semaphore:
- return await func(*args, **kwargs)
+ timeout = _per_task_timeout()
+ return await _await_with_outer_deadline(
+ func, *args, timeout=timeout, **kwargs
+ )
 
  tasks = []
  for count, test_case_pair in enumerate(test_case_pairs):
@@ -2229,7 +3185,19 @@ async def _evaluate_test_case_pairs(
  )
  tasks.append(asyncio.create_task(task))
  await asyncio.sleep(throttle_value)
- await asyncio.gather(*tasks)
+
+ try:
+ await asyncio.wait_for(
+ asyncio.gather(*tasks),
+ timeout=_gather_timeout(),
+ )
+ except (asyncio.TimeoutError, TimeoutError):
+ # Cancel any still-pending tasks and drain them
+ for t in tasks:
+ if not t.done():
+ t.cancel()
+ await asyncio.gather(*tasks, return_exceptions=True)
+ raise
 
 
  def _execute_metric(
@@ -2248,10 +3216,13 @@ def _execute_metric(
  )
  except MissingTestCaseParamsError as e:
  if error_config.skip_on_missing_params:
+ metric.skipped = True
+ metric.error = None
+ metric.success = None
  return "skip"
  else:
  if error_config.ignore_errors:
- metric.error = str(e)
+ metric.error = format_error_text(e)
  metric.success = False
  else:
  raise
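Throughout _execute_metric the bare str(e) is replaced with format_error_text(e). The helper's implementation is not part of this diff; judging only from the error messages above that point at DEEPEVAL_LOG_STACK_TRACES, it plausibly returns a short message by default and the full traceback when that setting is enabled. A hedged sketch of that idea (names and the exact env-var handling are assumptions):

import os
import traceback


def format_error_text_sketch(exc: BaseException) -> str:
    # Short message by default; full traceback only when stack traces are
    # explicitly requested. The real helper may read settings differently.
    if os.getenv("DEEPEVAL_LOG_STACK_TRACES") in ("1", "true", "True"):
        return "".join(
            traceback.format_exception(type(exc), exc, exc.__traceback__)
        )
    return f"{type(exc).__name__}: {exc}"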
@@ -2260,22 +3231,25 @@ def _execute_metric(
  metric.measure(test_case)
  except MissingTestCaseParamsError as e:
  if error_config.skip_on_missing_params:
+ metric.skipped = True
+ metric.error = None
+ metric.success = None
  return "skip"
  else:
  if error_config.ignore_errors:
- metric.error = str(e)
+ metric.error = format_error_text(e)
  metric.success = False
  else:
  raise
  except Exception as e:
  if error_config.ignore_errors:
- metric.error = str(e)
+ metric.error = format_error_text(e)
  metric.success = False
  else:
  raise
  except Exception as e:
  if error_config.ignore_errors:
- metric.error = str(e)
+ metric.error = format_error_text(e)
  metric.success = False
  else:
  raise