deepeval-3.6.8-py3-none-any.whl → deepeval-3.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. deepeval/_version.py +1 -1
  2. deepeval/anthropic/__init__.py +19 -0
  3. deepeval/anthropic/extractors.py +94 -0
  4. deepeval/anthropic/patch.py +169 -0
  5. deepeval/anthropic/utils.py +225 -0
  6. deepeval/benchmarks/drop/drop.py +40 -14
  7. deepeval/benchmarks/ifeval/ifeval.py +2 -2
  8. deepeval/confident/types.py +4 -2
  9. deepeval/config/settings.py +258 -47
  10. deepeval/config/settings_manager.py +4 -0
  11. deepeval/config/utils.py +5 -0
  12. deepeval/dataset/dataset.py +162 -30
  13. deepeval/dataset/utils.py +41 -13
  14. deepeval/evaluate/execute.py +1099 -633
  15. deepeval/integrations/crewai/handler.py +36 -0
  16. deepeval/integrations/langchain/callback.py +27 -2
  17. deepeval/integrations/llama_index/handler.py +58 -4
  18. deepeval/integrations/llama_index/utils.py +24 -0
  19. deepeval/metrics/__init__.py +5 -0
  20. deepeval/metrics/exact_match/__init__.py +0 -0
  21. deepeval/metrics/exact_match/exact_match.py +94 -0
  22. deepeval/metrics/indicator.py +21 -1
  23. deepeval/metrics/pattern_match/__init__.py +0 -0
  24. deepeval/metrics/pattern_match/pattern_match.py +103 -0
  25. deepeval/metrics/task_completion/task_completion.py +9 -2
  26. deepeval/model_integrations/__init__.py +0 -0
  27. deepeval/model_integrations/utils.py +116 -0
  28. deepeval/models/base_model.py +3 -1
  29. deepeval/models/llms/amazon_bedrock_model.py +20 -17
  30. deepeval/models/llms/openai_model.py +10 -1
  31. deepeval/models/retry_policy.py +103 -20
  32. deepeval/openai/__init__.py +3 -1
  33. deepeval/openai/extractors.py +2 -2
  34. deepeval/openai/utils.py +7 -31
  35. deepeval/prompt/api.py +11 -10
  36. deepeval/prompt/prompt.py +5 -4
  37. deepeval/simulator/conversation_simulator.py +25 -18
  38. deepeval/synthesizer/chunking/context_generator.py +9 -1
  39. deepeval/telemetry.py +3 -3
  40. deepeval/test_case/llm_test_case.py +3 -2
  41. deepeval/test_run/api.py +3 -2
  42. deepeval/test_run/cache.py +4 -3
  43. deepeval/test_run/test_run.py +24 -5
  44. deepeval/tracing/api.py +11 -10
  45. deepeval/tracing/otel/exporter.py +11 -0
  46. deepeval/tracing/patchers.py +102 -1
  47. deepeval/tracing/trace_context.py +13 -4
  48. deepeval/tracing/tracing.py +10 -1
  49. deepeval/tracing/types.py +8 -8
  50. deepeval/tracing/utils.py +9 -0
  51. deepeval/utils.py +44 -2
  52. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/METADATA +2 -2
  53. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/RECORD +57 -47
  54. /deepeval/{openai → model_integrations}/types.py +0 -0
  55. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/LICENSE.md +0 -0
  56. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/WHEEL +0 -0
  57. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/entry_points.txt +0 -0
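Most of the churn in this release sits in deepeval/evaluate/execute.py, which reworks test-case execution around per-task timeouts: an "outer deadline" is set before each case runs (via set_outer_deadline / reset_outer_deadline / run_sync_with_timeout imported from deepeval.models.retry_policy), coroutines are awaited under that deadline, and expired cases surface as (asyncio.TimeoutError, TimeoutError) instead of hanging. The sketch below only illustrates that pattern; the deadline bookkeeping here is a hypothetical stand-in built on contextvars, not deepeval's actual retry_policy implementation.

# Illustrative sketch of the outer-deadline pattern visible in the execute.py
# hunks below. set_outer_deadline / reset_outer_deadline are hypothetical
# stand-ins for the helpers that 3.7.0 imports from deepeval.models.retry_policy.
import asyncio
import contextvars
import inspect
import time

_outer_deadline: contextvars.ContextVar = contextvars.ContextVar(
    "outer_deadline", default=None
)


def set_outer_deadline(timeout: float):
    # Record an absolute deadline so nested retry/backoff logic could respect it.
    return _outer_deadline.set(time.monotonic() + timeout)


def reset_outer_deadline(token) -> None:
    # Restore whatever deadline (if any) was active before this case started.
    _outer_deadline.reset(token)


async def await_with_outer_deadline(obj, *args, timeout: float, **kwargs):
    # Await a coroutine, or call a coroutine function, under a shared deadline.
    token = set_outer_deadline(timeout)
    try:
        coro = obj if inspect.isawaitable(obj) else obj(*args, **kwargs)
        return await asyncio.wait_for(coro, timeout=timeout)
    finally:
        reset_outer_deadline(token)


async def _slow_metric(delay: float) -> str:
    await asyncio.sleep(delay)
    return "done"


if __name__ == "__main__":
    # Finishes well inside the 1s budget.
    print(asyncio.run(await_with_outer_deadline(_slow_metric, 0.1, timeout=1.0)))
    # Exceeds the budget; callers in the new execute.py code paths catch this
    # as (asyncio.TimeoutError, TimeoutError) and mark the metrics as timed out.
    try:
        asyncio.run(await_with_outer_deadline(_slow_metric, 2.0, timeout=0.5))
    except (asyncio.TimeoutError, TimeoutError):
        print("timed out")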
@@ -1,3 +1,4 @@
+import inspect
 import logging
 
 from rich.progress import (
@@ -56,10 +57,16 @@ from deepeval.metrics import (
     BaseMetric,
     BaseConversationalMetric,
     BaseMultimodalMetric,
+    TaskCompletionMetric,
 )
 from deepeval.metrics.indicator import (
     measure_metrics_with_indicator,
 )
+from deepeval.models.retry_policy import (
+    set_outer_deadline,
+    reset_outer_deadline,
+    run_sync_with_timeout,
+)
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
@@ -238,6 +245,18 @@ def filter_duplicate_results(
     ]
 
 
+async def _await_with_outer_deadline(obj, *args, timeout: float, **kwargs):
+    token = set_outer_deadline(timeout)
+    try:
+        if inspect.isawaitable(obj):
+            coro = obj
+        else:
+            coro = obj(*args, **kwargs)
+        return await asyncio.wait_for(coro, timeout=timeout)
+    finally:
+        reset_outer_deadline(token)
+
+
 ###########################################
 ### E2E Evals #############################
 ###########################################
@@ -269,6 +288,13 @@ def execute_test_cases(
 
     test_run_manager.save_to_disk = cache_config.write_cache
     test_run = test_run_manager.get_test_run(identifier=identifier)
+    if test_run is None:
+        # ensure we have a test_run ( in case it couldn't be loaded from disk )
+        test_run_manager.create_test_run(identifier=identifier)
+        test_run = test_run_manager.get_test_run(identifier=identifier)
+
+    # capture once for inner closures
+    hyperparameters = test_run.hyperparameters if test_run is not None else None
 
     if display_config.verbose_mode is not None:
         for metric in metrics:
@@ -289,176 +315,228 @@ def execute_test_cases(
289
315
  test_results: List[TestResult] = []
290
316
 
291
317
  def evaluate_test_cases(
292
- progress: Optional[Progress] = None, pbar_id: Optional[str] = None
318
+ progress: Optional[Progress] = None, pbar_id: Optional[int] = None
293
319
  ):
294
320
  llm_test_case_count = -1
321
+ mllm_test_case_count = -1
295
322
  conversational_test_case_count = -1
296
323
  show_metric_indicator = (
297
324
  display_config.show_indicator and not _use_bar_indicator
298
325
  )
299
326
  for i, test_case in enumerate(test_cases):
327
+ # skip what we know we won't run
328
+ if isinstance(test_case, LLMTestCase):
329
+ if not llm_metrics:
330
+ update_pbar(progress, pbar_id)
331
+ continue
332
+ per_case_total = len(llm_metrics)
333
+ elif isinstance(test_case, MLLMTestCase):
334
+ if not mllm_metrics:
335
+ update_pbar(progress, pbar_id)
336
+ continue
337
+ per_case_total = len(mllm_metrics)
338
+ elif isinstance(test_case, ConversationalTestCase):
339
+ if not conversational_metrics:
340
+ update_pbar(progress, pbar_id)
341
+ continue
342
+ per_case_total = len(conversational_metrics)
343
+
300
344
  pbar_test_case_id = add_pbar(
301
345
  progress,
302
346
  f" 🎯 Evaluating test case #{i}",
303
- total=len(metrics),
347
+ total=per_case_total,
304
348
  )
305
- with capture_evaluation_run("test case"):
306
- for metric in metrics:
307
- metric.error = None # Reset metric error
308
349
 
309
- if isinstance(test_case, LLMTestCase):
310
- if len(llm_metrics) == 0:
311
- continue
312
-
313
- llm_test_case_count += 1
314
- cached_test_case = None
315
- if cache_config.use_cache:
316
- cached_test_case = (
317
- global_test_run_cache_manager.get_cached_test_case(
318
- test_case, test_run.hyperparameters
319
- )
320
- )
321
-
322
- ##### Metric Calculation #####
323
- api_test_case: LLMApiTestCase = create_api_test_case(
324
- test_case=test_case, index=llm_test_case_count
350
+ metrics_for_case = (
351
+ llm_metrics
352
+ if isinstance(test_case, LLMTestCase)
353
+ else (
354
+ mllm_metrics
355
+ if isinstance(test_case, MLLMTestCase)
356
+ else conversational_metrics
357
+ )
358
+ )
359
+ api_test_case = create_api_test_case(
360
+ test_case=test_case,
361
+ index=(
362
+ llm_test_case_count + 1
363
+ if isinstance(test_case, LLMTestCase)
364
+ else (
365
+ mllm_test_case_count + 1
366
+ if isinstance(test_case, MLLMTestCase)
367
+ else conversational_test_case_count + 1
325
368
  )
326
- new_cached_test_case: CachedTestCase = CachedTestCase()
327
-
328
- test_start_time = time.perf_counter()
329
- read_all_metrics_from_cache = True
330
- for metric in llm_metrics:
331
- metric_data = None
332
- if cached_test_case is not None:
333
- cached_metric_data = Cache.get_metric_data(
334
- metric, cached_test_case
335
- )
336
- if cached_metric_data:
337
- metric_data = cached_metric_data.metric_data
338
-
339
- if metric_data is None:
340
- read_all_metrics_from_cache = False
341
- res = _execute_metric(
342
- metric=metric,
343
- test_case=test_case,
344
- show_metric_indicator=show_metric_indicator,
345
- in_component=False,
346
- error_config=error_config,
347
- )
348
- if res == "skip":
349
- continue
350
- metric_data = create_metric_data(metric)
351
-
352
- # here, we will check for an additional property on the flattened test cases to see if updating is necessary
353
- api_test_case.update_metric_data(metric_data)
354
- if metric.error is None:
355
- cache_metric_data = deepcopy(metric_data)
356
- cache_metric_data.evaluation_cost = 0 # Cached metrics will have evaluation cost as 0, not None.
357
- updated_cached_metric_data = CachedMetricData(
358
- metric_data=cache_metric_data,
359
- metric_configuration=Cache.create_metric_configuration(
360
- metric
361
- ),
362
- )
363
- new_cached_test_case.cached_metrics_data.append(
364
- updated_cached_metric_data
365
- )
366
- update_pbar(progress, pbar_test_case_id)
369
+ ),
370
+ )
371
+ emitted = [False] * len(metrics_for_case)
372
+ index_of = {id(m): i for i, m in enumerate(metrics_for_case)}
373
+ current_index = -1
374
+ start_time = time.perf_counter()
375
+ deadline_timeout = _per_task_timeout()
376
+ deadline_token = set_outer_deadline(deadline_timeout)
377
+ new_cached_test_case: CachedTestCase = None
378
+ try:
367
379
 
368
- test_end_time = time.perf_counter()
369
- if read_all_metrics_from_cache:
370
- run_duration = 0
371
- else:
372
- run_duration = test_end_time - test_start_time
373
- api_test_case.update_run_duration(run_duration)
380
+ def _run_case():
381
+ nonlocal new_cached_test_case, current_index, llm_test_case_count, mllm_test_case_count, conversational_test_case_count
382
+ with capture_evaluation_run("test case"):
383
+ for metric in metrics:
384
+ metric.error = None # Reset metric error
385
+
386
+ if isinstance(test_case, LLMTestCase):
387
+ llm_test_case_count += 1
388
+ cached_test_case = None
389
+ if cache_config.use_cache:
390
+ cached_test_case = global_test_run_cache_manager.get_cached_test_case(
391
+ test_case, hyperparameters
392
+ )
374
393
 
375
- ### Update Test Run ###
376
- test_run_manager.update_test_run(api_test_case, test_case)
394
+ ##### Metric Calculation #####
395
+ new_cached_test_case = CachedTestCase()
377
396
 
378
- ### Cache Test Run ###
379
- global_test_run_cache_manager.cache_test_case(
380
- test_case,
381
- new_cached_test_case,
382
- test_run.hyperparameters,
383
- )
384
- global_test_run_cache_manager.cache_test_case(
385
- test_case,
386
- new_cached_test_case,
387
- test_run.hyperparameters,
388
- to_temp=True,
389
- )
397
+ for metric in llm_metrics:
398
+ current_index = index_of[id(metric)]
399
+ metric_data = None
400
+ if cached_test_case is not None:
401
+ cached_metric_data = Cache.get_metric_data(
402
+ metric, cached_test_case
403
+ )
404
+ if cached_metric_data:
405
+ metric_data = (
406
+ cached_metric_data.metric_data
407
+ )
390
408
 
391
- # No caching and not sending test cases to Confident AI for multimodal metrics yet
392
- elif isinstance(test_case, MLLMTestCase):
393
- if len(mllm_metrics) == 0:
394
- continue
409
+ if metric_data is None:
410
+ res = _execute_metric(
411
+ metric=metric,
412
+ test_case=test_case,
413
+ show_metric_indicator=show_metric_indicator,
414
+ in_component=False,
415
+ error_config=error_config,
416
+ )
417
+ if res == "skip":
418
+ continue
419
+ metric_data = create_metric_data(metric)
395
420
 
396
- api_test_case: LLMApiTestCase = create_api_test_case(
397
- test_case=test_case, index=llm_test_case_count
398
- )
399
- test_start_time = time.perf_counter()
400
- for metric in mllm_metrics:
401
- res = _execute_metric(
402
- metric=metric,
403
- test_case=test_case,
404
- show_metric_indicator=show_metric_indicator,
405
- in_component=False,
406
- error_config=error_config,
407
- )
408
- if res == "skip":
409
- continue
421
+ # here, we will check for an additional property on the flattened test cases to see if updating is necessary
422
+ api_test_case.update_metric_data(metric_data)
423
+ emitted[current_index] = True
424
+ if metric.error is None:
425
+ cache_metric_data = deepcopy(metric_data)
426
+ cache_metric_data.evaluation_cost = 0 # Cached metrics will have evaluation cost as 0, not None.
427
+ updated_cached_metric_data = CachedMetricData(
428
+ metric_data=cache_metric_data,
429
+ metric_configuration=Cache.create_metric_configuration(
430
+ metric
431
+ ),
432
+ )
433
+ new_cached_test_case.cached_metrics_data.append(
434
+ updated_cached_metric_data
435
+ )
436
+ update_pbar(progress, pbar_test_case_id)
410
437
 
411
- metric_data = create_metric_data(metric)
412
- api_test_case.update_metric_data(metric_data)
413
- update_pbar(progress, pbar_test_case_id)
438
+ # No caching and not sending test cases to Confident AI for multimodal metrics yet
439
+ elif isinstance(test_case, MLLMTestCase):
440
+ mllm_test_case_count += 1
441
+ for metric in mllm_metrics:
442
+ current_index = index_of[id(metric)]
443
+ res = _execute_metric(
444
+ metric=metric,
445
+ test_case=test_case,
446
+ show_metric_indicator=show_metric_indicator,
447
+ in_component=False,
448
+ error_config=error_config,
449
+ )
450
+ if res == "skip":
451
+ continue
414
452
 
415
- test_end_time = time.perf_counter()
416
- if len(mllm_metrics) > 0:
417
- run_duration = test_end_time - test_start_time
418
- api_test_case.update_run_duration(run_duration)
453
+ metric_data = create_metric_data(metric)
454
+ api_test_case.update_metric_data(metric_data)
455
+ emitted[current_index] = True
456
+ update_pbar(progress, pbar_test_case_id)
419
457
 
420
- ### Update Test Run ###
421
- test_run_manager.update_test_run(api_test_case, test_case)
458
+ # No caching for conversational metrics yet
459
+ elif isinstance(test_case, ConversationalTestCase):
460
+ conversational_test_case_count += 1
461
+ for metric in conversational_metrics:
462
+ current_index = index_of[id(metric)]
463
+ res = _execute_metric(
464
+ metric=metric,
465
+ test_case=test_case,
466
+ show_metric_indicator=show_metric_indicator,
467
+ in_component=False,
468
+ error_config=error_config,
469
+ )
470
+ if res == "skip":
471
+ continue
422
472
 
423
- # No caching for conversational metrics yet
424
- elif isinstance(test_case, ConversationalTestCase):
425
- if len(metrics) == 0:
473
+ metric_data = create_metric_data(metric)
474
+ api_test_case.update_metric_data(metric_data)
475
+ emitted[current_index] = True
476
+ update_pbar(progress, pbar_test_case_id)
477
+
478
+ run_sync_with_timeout(_run_case, deadline_timeout)
479
+ except (asyncio.TimeoutError, TimeoutError):
480
+ msg = (
481
+ f"Timed out after {deadline_timeout:.2f}s while evaluating metric. "
482
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
483
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
484
+ )
485
+ for i, m in enumerate(metrics_for_case):
486
+ if getattr(m, "skipped", False):
487
+ continue
488
+ # already finished or errored? leave it
489
+ if getattr(m, "success", None) is not None or getattr(
490
+ m, "error", None
491
+ ):
426
492
  continue
493
+ if i == current_index:
494
+ m.success = False
495
+ m.error = msg
496
+ elif i > current_index:
497
+ m.success = False
498
+ m.error = "Skipped due to case timeout."
499
+
500
+ if not error_config.ignore_errors:
501
+ raise
427
502
 
428
- conversational_test_case_count += 1
429
- api_test_case: ConversationalApiTestCase = (
430
- create_api_test_case(
431
- test_case=test_case,
432
- index=conversational_test_case_count,
503
+ finally:
504
+ try:
505
+ if (
506
+ isinstance(test_case, LLMTestCase)
507
+ and new_cached_test_case is not None
508
+ ):
509
+ ### Cache Test Run ###
510
+ global_test_run_cache_manager.cache_test_case(
511
+ test_case,
512
+ new_cached_test_case,
513
+ hyperparameters,
433
514
  )
434
- )
435
-
436
- test_start_time = time.perf_counter()
437
- for metric in metrics:
438
- res = _execute_metric(
439
- metric=metric,
440
- test_case=test_case,
441
- show_metric_indicator=show_metric_indicator,
442
- in_component=False,
443
- error_config=error_config,
515
+ global_test_run_cache_manager.cache_test_case(
516
+ test_case,
517
+ new_cached_test_case,
518
+ hyperparameters,
519
+ to_temp=True,
444
520
  )
445
- if res == "skip":
446
- continue
447
-
448
- metric_data = create_metric_data(metric)
449
- api_test_case.update_metric_data(metric_data)
450
- update_pbar(progress, pbar_test_case_id)
451
521
 
452
- test_end_time = time.perf_counter()
453
- run_duration = test_end_time - test_start_time
454
- api_test_case.update_run_duration(run_duration)
522
+ # Attach MetricData for *all* metrics (finished or synthesized)
523
+ for i, m in enumerate(metrics_for_case):
524
+ if getattr(m, "skipped", False):
525
+ continue
526
+ if not emitted[i]:
527
+ api_test_case.update_metric_data(
528
+ create_metric_data(m)
529
+ )
455
530
 
456
- ### Update Test Run ###
531
+ elapsed = time.perf_counter() - start_time
532
+ api_test_case.update_run_duration(
533
+ elapsed if elapsed >= 0 else deadline_timeout
534
+ )
457
535
  test_run_manager.update_test_run(api_test_case, test_case)
458
-
459
- test_result = create_test_result(api_test_case)
460
- test_results.append(test_result)
461
- update_pbar(progress, pbar_id)
536
+ test_results.append(create_test_result(api_test_case))
537
+ update_pbar(progress, pbar_id)
538
+ finally:
539
+ reset_outer_deadline(deadline_token)
462
540
 
463
541
  if display_config.show_indicator and _use_bar_indicator:
464
542
  progress = Progress(
@@ -503,9 +581,9 @@ async def a_execute_test_cases(
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-            return await asyncio.wait_for(
-                func(*args, **kwargs),
-                timeout=_per_task_timeout(),
+            timeout = _per_task_timeout()
+            return await _await_with_outer_deadline(
+                func, *args, timeout=timeout, **kwargs
             )
 
     global_test_run_cache_manager.disable_write_cache = (
@@ -609,7 +687,7 @@ async def a_execute_test_cases(
 
                         task = execute_with_semaphore(
                             func=_a_execute_conversational_test_cases,
-                            metrics=copy_metrics(metrics),
+                            metrics=copy_metrics(conversational_metrics),
                             test_case=test_case,
                             test_run_manager=test_run_manager,
                             test_results=test_results,
@@ -631,13 +709,15 @@ async def a_execute_test_cases(
                     asyncio.gather(*tasks),
                     timeout=_gather_timeout(),
                 )
-            except asyncio.TimeoutError:
-                # Cancel any still-pending tasks and drain them
+            except (asyncio.TimeoutError, TimeoutError):
                 for t in tasks:
                     if not t.done():
                         t.cancel()
                 await asyncio.gather(*tasks, return_exceptions=True)
-                raise
+                logging.getLogger("deepeval").error(
+                    "Gather timed out after %.1fs. Some metrics may be marked as timed out.",
+                    _gather_timeout(),
+                )
 
     else:
         for test_case in test_cases:
@@ -717,7 +797,7 @@ async def a_execute_test_cases(
                 asyncio.gather(*tasks),
                 timeout=_gather_timeout(),
             )
-        except asyncio.TimeoutError:
+        except (asyncio.TimeoutError, TimeoutError):
             # Cancel any still-pending tasks and drain them
             for t in tasks:
                 if not t.done():
@@ -744,6 +824,7 @@ async def _a_execute_llm_test_cases(
     progress: Optional[Progress] = None,
     pbar_id: Optional[int] = None,
 ):
+    logger.info("in _a_execute_llm_test_cases")
     pbar_test_case_id = add_pbar(
         progress,
         f" 🎯 Evaluating test case #{count}",
@@ -767,64 +848,85 @@ async def _a_execute_llm_test_cases(
767
848
  api_test_case = create_api_test_case(
768
849
  test_case=test_case, index=count if not _is_assert_test else None
769
850
  )
770
- new_cached_test_case: CachedTestCase = CachedTestCase()
771
- test_start_time = time.perf_counter()
772
- await measure_metrics_with_indicator(
773
- metrics=metrics,
774
- test_case=test_case,
775
- cached_test_case=cached_test_case,
776
- skip_on_missing_params=skip_on_missing_params,
777
- ignore_errors=ignore_errors,
778
- show_indicator=show_metrics_indicator,
779
- pbar_eval_id=pbar_test_case_id,
780
- progress=progress,
781
- )
851
+ try:
852
+ new_cached_test_case: CachedTestCase = CachedTestCase()
853
+ test_start_time = time.perf_counter()
782
854
 
783
- for metric in metrics:
784
- if metric.skipped:
785
- continue
855
+ await measure_metrics_with_indicator(
856
+ metrics=metrics,
857
+ test_case=test_case,
858
+ cached_test_case=cached_test_case,
859
+ skip_on_missing_params=skip_on_missing_params,
860
+ ignore_errors=ignore_errors,
861
+ show_indicator=show_metrics_indicator,
862
+ pbar_eval_id=pbar_test_case_id,
863
+ progress=progress,
864
+ )
865
+ except asyncio.CancelledError:
866
+ msg = (
867
+ "Timed out/cancelled while evaluating metric. "
868
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
869
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
870
+ )
871
+ for m in metrics:
872
+ if getattr(m, "skipped", False):
873
+ continue
874
+ # If the task never finished and didn't set a terminal state, mark it now
875
+ if getattr(m, "success", None) is None and not getattr(
876
+ m, "error", None
877
+ ):
878
+ m.success = False
879
+ m.error = msg
880
+ if not ignore_errors:
881
+ raise
882
+ finally:
883
+ for metric in metrics:
884
+ if metric.skipped:
885
+ continue
786
886
 
787
- metric_data = create_metric_data(metric)
788
- api_test_case.update_metric_data(metric_data)
887
+ metric_data = create_metric_data(metric)
888
+ api_test_case.update_metric_data(metric_data)
789
889
 
790
- if metric.error is None:
791
- cache_metric_data = deepcopy(metric_data)
792
- cache_metric_data.evaluation_cost = (
793
- 0 # Create new copy and save 0 for cost
794
- )
795
- updated_cached_metric_data = CachedMetricData(
796
- metric_data=cache_metric_data,
797
- metric_configuration=Cache.create_metric_configuration(metric),
798
- )
799
- new_cached_test_case.cached_metrics_data.append(
800
- updated_cached_metric_data
801
- )
890
+ if metric.error is None:
891
+ cache_metric_data = deepcopy(metric_data)
892
+ cache_metric_data.evaluation_cost = (
893
+ 0 # Create new copy and save 0 for cost
894
+ )
895
+ updated_cached_metric_data = CachedMetricData(
896
+ metric_data=cache_metric_data,
897
+ metric_configuration=Cache.create_metric_configuration(
898
+ metric
899
+ ),
900
+ )
901
+ new_cached_test_case.cached_metrics_data.append(
902
+ updated_cached_metric_data
903
+ )
802
904
 
803
- test_end_time = time.perf_counter()
804
- run_duration = test_end_time - test_start_time
805
- # Quick hack to check if all metrics were from cache
806
- if run_duration < 1:
807
- run_duration = 0
808
- api_test_case.update_run_duration(run_duration)
809
-
810
- ### Update Test Run ###
811
- test_run_manager.update_test_run(api_test_case, test_case)
812
-
813
- ### Cache Test Run ###
814
- global_test_run_cache_manager.cache_test_case(
815
- test_case,
816
- new_cached_test_case,
817
- test_run.hyperparameters,
818
- )
819
- global_test_run_cache_manager.cache_test_case(
820
- test_case,
821
- new_cached_test_case,
822
- test_run.hyperparameters,
823
- to_temp=True,
824
- )
905
+ test_end_time = time.perf_counter()
906
+ run_duration = test_end_time - test_start_time
907
+ # Quick hack to check if all metrics were from cache
908
+ if run_duration < 1:
909
+ run_duration = 0
910
+ api_test_case.update_run_duration(run_duration)
911
+
912
+ ### Update Test Run ###
913
+ test_run_manager.update_test_run(api_test_case, test_case)
825
914
 
826
- test_results.append(create_test_result(api_test_case))
827
- update_pbar(progress, pbar_id)
915
+ ### Cache Test Run ###
916
+ global_test_run_cache_manager.cache_test_case(
917
+ test_case,
918
+ new_cached_test_case,
919
+ test_run.hyperparameters,
920
+ )
921
+ global_test_run_cache_manager.cache_test_case(
922
+ test_case,
923
+ new_cached_test_case,
924
+ test_run.hyperparameters,
925
+ to_temp=True,
926
+ )
927
+
928
+ test_results.append(create_test_result(api_test_case))
929
+ update_pbar(progress, pbar_id)
828
930
 
829
931
 
830
932
  async def _a_execute_mllm_test_cases(
@@ -856,31 +958,50 @@ async def _a_execute_mllm_test_cases(
856
958
  test_case=test_case, index=count if not _is_assert_test else None
857
959
  )
858
960
  test_start_time = time.perf_counter()
859
- await measure_metrics_with_indicator(
860
- metrics=metrics,
861
- test_case=test_case,
862
- cached_test_case=None,
863
- skip_on_missing_params=skip_on_missing_params,
864
- ignore_errors=ignore_errors,
865
- show_indicator=show_metrics_indicator,
866
- pbar_eval_id=pbar_test_case_id,
867
- progress=progress,
868
- )
869
- for metric in metrics:
870
- if metric.skipped:
871
- continue
961
+ try:
962
+ await measure_metrics_with_indicator(
963
+ metrics=metrics,
964
+ test_case=test_case,
965
+ cached_test_case=None,
966
+ skip_on_missing_params=skip_on_missing_params,
967
+ ignore_errors=ignore_errors,
968
+ show_indicator=show_metrics_indicator,
969
+ pbar_eval_id=pbar_test_case_id,
970
+ progress=progress,
971
+ )
972
+ except asyncio.CancelledError:
973
+ msg = (
974
+ "Timed out/cancelled while evaluating metric. "
975
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
976
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
977
+ )
978
+ for m in metrics:
979
+ if getattr(m, "skipped", False):
980
+ continue
981
+ # If the task never finished and didn't set a terminal state, mark it now
982
+ if getattr(m, "success", None) is None and not getattr(
983
+ m, "error", None
984
+ ):
985
+ m.success = False
986
+ m.error = msg
987
+ if not ignore_errors:
988
+ raise
989
+ finally:
990
+ for metric in metrics:
991
+ if metric.skipped:
992
+ continue
872
993
 
873
- metric_data = create_metric_data(metric)
874
- api_test_case.update_metric_data(metric_data)
994
+ metric_data = create_metric_data(metric)
995
+ api_test_case.update_metric_data(metric_data)
875
996
 
876
- test_end_time = time.perf_counter()
877
- run_duration = test_end_time - test_start_time
878
- api_test_case.update_run_duration(run_duration)
997
+ test_end_time = time.perf_counter()
998
+ run_duration = test_end_time - test_start_time
999
+ api_test_case.update_run_duration(run_duration)
879
1000
 
880
- ### Update Test Run ###
881
- test_run_manager.update_test_run(api_test_case, test_case)
882
- test_results.append(create_test_result(api_test_case))
883
- update_pbar(progress, pbar_id)
1001
+ ### Update Test Run ###
1002
+ test_run_manager.update_test_run(api_test_case, test_case)
1003
+ test_results.append(create_test_result(api_test_case))
1004
+ update_pbar(progress, pbar_id)
884
1005
 
885
1006
 
886
1007
  async def _a_execute_conversational_test_cases(
@@ -915,33 +1036,55 @@ async def _a_execute_conversational_test_cases(
915
1036
  )
916
1037
 
917
1038
  test_start_time = time.perf_counter()
918
- await measure_metrics_with_indicator(
919
- metrics=metrics,
920
- test_case=test_case,
921
- cached_test_case=None,
922
- skip_on_missing_params=skip_on_missing_params,
923
- ignore_errors=ignore_errors,
924
- show_indicator=show_metrics_indicator,
925
- pbar_eval_id=pbar_test_case_id,
926
- progress=progress,
927
- )
928
- for metric in metrics:
929
- if metric.skipped:
930
- continue
931
1039
 
932
- metric_data = create_metric_data(metric)
933
- api_test_case.update_metric_data(metric_data)
1040
+ try:
1041
+ await measure_metrics_with_indicator(
1042
+ metrics=metrics,
1043
+ test_case=test_case,
1044
+ cached_test_case=None,
1045
+ skip_on_missing_params=skip_on_missing_params,
1046
+ ignore_errors=ignore_errors,
1047
+ show_indicator=show_metrics_indicator,
1048
+ pbar_eval_id=pbar_test_case_id,
1049
+ progress=progress,
1050
+ )
934
1051
 
935
- test_end_time = time.perf_counter()
936
- if len(metrics) > 0:
937
- run_duration = test_end_time - test_start_time
938
- api_test_case.update_run_duration(run_duration)
1052
+ except asyncio.CancelledError:
1053
+ msg = (
1054
+ "Timed out/cancelled while evaluating metric. "
1055
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
1056
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
1057
+ )
1058
+ for m in metrics:
1059
+ if getattr(m, "skipped", False):
1060
+ continue
1061
+ # If the task never finished and didn't set a terminal state, mark it now
1062
+ if getattr(m, "success", None) is None and not getattr(
1063
+ m, "error", None
1064
+ ):
1065
+ m.success = False
1066
+ m.error = msg
1067
+ if not ignore_errors:
1068
+ raise
939
1069
 
940
- ### Update Test Run ###
941
- test_run_manager.update_test_run(api_test_case, test_case)
1070
+ finally:
1071
+ for metric in metrics:
1072
+ if metric.skipped:
1073
+ continue
1074
+
1075
+ metric_data = create_metric_data(metric)
1076
+ api_test_case.update_metric_data(metric_data)
1077
+
1078
+ test_end_time = time.perf_counter()
1079
+ if len(metrics) > 0:
1080
+ run_duration = test_end_time - test_start_time
1081
+ api_test_case.update_run_duration(run_duration)
942
1082
 
943
- test_results.append(create_test_result(api_test_case))
944
- update_pbar(progress, pbar_id)
1083
+ ### Update Test Run ###
1084
+ test_run_manager.update_test_run(api_test_case, test_case)
1085
+
1086
+ test_results.append(create_test_result(api_test_case))
1087
+ update_pbar(progress, pbar_id)
945
1088
 
946
1089
 
947
1090
  ###########################################
@@ -965,7 +1108,11 @@ def execute_agentic_test_cases(
     test_run_manager = global_test_run_manager
 
     test_run_manager.save_to_disk = cache_config.write_cache
-    test_run_manager.get_test_run(identifier=identifier)
+    test_run = test_run_manager.get_test_run(identifier=identifier)
+    if test_run is None:
+        # Create if not found
+        test_run_manager.create_test_run(identifier=identifier)
+        test_run = test_run_manager.get_test_run(identifier=identifier)
 
     local_trace_manager = trace_manager
     local_trace_manager.evaluating = True
@@ -975,152 +1122,137 @@ def execute_agentic_test_cases(
975
1122
  progress: Optional[Progress] = None,
976
1123
  pbar_id: Optional[int] = None,
977
1124
  ):
978
- count = 0
1125
+ count = -1
979
1126
  show_metric_indicator = (
980
1127
  display_config.show_indicator and not _use_bar_indicator
981
1128
  )
982
1129
 
983
1130
  for golden in goldens:
984
- with capture_evaluation_run("golden"):
985
- count += 1
986
- total_tags = count_observe_decorators_in_module(
987
- observed_callback
988
- )
989
- pbar_tags_id = add_pbar(
990
- progress,
991
- f" ⚡ Invoking observed callback (#{count})",
992
- total=total_tags,
993
- )
994
-
995
- with Observer(
996
- "custom",
997
- func_name="Test Wrapper",
998
- _progress=progress,
999
- _pbar_callback_id=pbar_tags_id,
1000
- ):
1001
-
1002
- if asyncio.iscoroutinefunction(observed_callback):
1003
- loop = get_or_create_event_loop()
1004
- coro = observed_callback(golden.input)
1005
- loop.run_until_complete(
1006
- asyncio.wait_for(
1007
- coro,
1008
- timeout=_per_task_timeout(),
1009
- )
1010
- )
1011
- else:
1012
- observed_callback(golden.input)
1013
- current_trace: Trace = current_trace_context.get()
1014
-
1015
- update_pbar(progress, pbar_tags_id, advance=total_tags)
1016
- update_pbar(progress, pbar_id)
1017
-
1018
- # Create empty trace api for llm api test case
1019
- trace_api = create_api_trace(current_trace, golden)
1020
-
1021
- # Format golden as test case to create llm api test case
1022
- test_case = LLMTestCase(
1023
- input=golden.input,
1024
- actual_output=(
1025
- str(current_trace.output)
1026
- if current_trace.output is not None
1027
- else None
1028
- ),
1029
- expected_output=current_trace.expected_output,
1030
- context=current_trace.context,
1031
- retrieval_context=current_trace.retrieval_context,
1032
- additional_metadata=golden.additional_metadata,
1033
- tools_called=current_trace.tools_called,
1034
- expected_tools=current_trace.expected_tools,
1035
- comments=golden.comments,
1036
- name=golden.name,
1037
- _dataset_alias=golden._dataset_alias,
1038
- _dataset_id=golden._dataset_id,
1039
- )
1040
- api_test_case = create_api_test_case(
1041
- test_case=test_case,
1042
- trace=trace_api,
1043
- index=count if not _is_assert_test else None,
1044
- )
1131
+ count += 1
1045
1132
 
1046
- # Run DFS to calculate metrics synchronously
1047
- def dfs(
1048
- span: BaseSpan,
1049
- progress: Optional[Progress] = None,
1050
- pbar_eval_id: Optional[int] = None,
1051
- ):
1052
- # Create API Span
1053
- metrics: List[BaseMetric] = list(span.metrics or [])
1054
- api_span: BaseApiSpan = (
1055
- trace_manager._convert_span_to_api_span(span)
1133
+ pbar_case_increments = (
1134
+ 0 # tracks how many times we advance `pbar_id` for this golden
1135
+ )
1136
+ emitted_trace = set()
1137
+ current_trace: Optional[Trace] = None
1138
+ trace_api = None
1139
+ api_test_case = None
1140
+ test_case = None
1141
+
1142
+ def _run_golden():
1143
+ nonlocal current_trace, trace_api, api_test_case, test_case, pbar_case_increments
1144
+ # keep the evaluation context inside the timed function
1145
+ with capture_evaluation_run("golden"):
1146
+ total_tags = count_observe_decorators_in_module(
1147
+ observed_callback
1148
+ )
1149
+ pbar_tags_id = add_pbar(
1150
+ progress,
1151
+ f" ⚡ Invoking observed callback (#{count})",
1152
+ total=total_tags,
1056
1153
  )
1057
1154
 
1058
- if isinstance(span, AgentSpan):
1059
- trace_api.agent_spans.append(api_span)
1060
- elif isinstance(span, LlmSpan):
1061
- trace_api.llm_spans.append(api_span)
1062
- log_prompt(span, test_run_manager)
1063
- elif isinstance(span, RetrieverSpan):
1064
- trace_api.retriever_spans.append(api_span)
1065
- elif isinstance(span, ToolSpan):
1066
- trace_api.tool_spans.append(api_span)
1067
- else:
1068
- trace_api.base_spans.append(api_span)
1069
-
1070
- # Skip errored trace/span
1071
- if _skip_metrics_for_error(span=span, trace=current_trace):
1072
- api_span.status = TraceSpanApiStatus.ERRORED
1073
- api_span.error = span.error or _trace_error(
1074
- current_trace
1075
- )
1076
- if progress and pbar_eval_id is not None:
1077
- update_pbar(
1078
- progress,
1079
- pbar_eval_id,
1080
- advance=count_metrics_in_span_subtree(span),
1155
+ with Observer(
1156
+ "custom",
1157
+ func_name="Test Wrapper",
1158
+ _progress=progress,
1159
+ _pbar_callback_id=pbar_tags_id,
1160
+ ):
1161
+ if asyncio.iscoroutinefunction(observed_callback):
1162
+ loop = get_or_create_event_loop()
1163
+ coro = observed_callback(golden.input)
1164
+ loop.run_until_complete(
1165
+ _await_with_outer_deadline(
1166
+ coro,
1167
+ timeout=_per_task_timeout(),
1168
+ )
1081
1169
  )
1082
- return
1170
+ else:
1171
+ observed_callback(golden.input)
1083
1172
 
1084
- for child in span.children:
1085
- dfs(child, progress, pbar_eval_id)
1173
+ # we have a trace now
1174
+ current_trace = current_trace_context.get()
1086
1175
 
1087
- if not span.metrics:
1088
- return
1089
- requires_trace = any(
1090
- metric.requires_trace for metric in span.metrics
1176
+ update_pbar(progress, pbar_tags_id, advance=total_tags)
1177
+ update_pbar(progress, pbar_id)
1178
+ pbar_case_increments += 1
1179
+
1180
+ # Create empty trace api for llm api test case
1181
+ trace_api = create_api_trace(current_trace, golden)
1182
+
1183
+ # Build the test case and api test case
1184
+ test_case = LLMTestCase(
1185
+ input=golden.input,
1186
+ actual_output=(
1187
+ str(current_trace.output)
1188
+ if current_trace
1189
+ and current_trace.output is not None
1190
+ else None
1191
+ ),
1192
+ expected_output=(
1193
+ current_trace.expected_output
1194
+ if current_trace
1195
+ else None
1196
+ ),
1197
+ context=(
1198
+ current_trace.context if current_trace else None
1199
+ ),
1200
+ retrieval_context=(
1201
+ current_trace.retrieval_context
1202
+ if current_trace
1203
+ else None
1204
+ ),
1205
+ additional_metadata=golden.additional_metadata,
1206
+ tools_called=(
1207
+ current_trace.tools_called
1208
+ if current_trace
1209
+ else None
1210
+ ),
1211
+ expected_tools=(
1212
+ current_trace.expected_tools
1213
+ if current_trace
1214
+ else None
1215
+ ),
1216
+ comments=golden.comments,
1217
+ name=golden.name,
1218
+ _dataset_alias=golden._dataset_alias,
1219
+ _dataset_id=golden._dataset_id,
1220
+ )
1221
+ api_test_case = create_api_test_case(
1222
+ test_case=test_case,
1223
+ trace=trace_api,
1224
+ index=count if not _is_assert_test else None,
1091
1225
  )
1092
1226
 
1093
- llm_test_case = None
1094
- if span.input is not None:
1095
- llm_test_case = LLMTestCase(
1096
- input=str(span.input),
1097
- actual_output=(
1098
- str(span.output)
1099
- if span.output is not None
1100
- else None
1101
- ),
1102
- expected_output=span.expected_output,
1103
- context=span.context,
1104
- retrieval_context=span.retrieval_context,
1105
- tools_called=span.tools_called,
1106
- expected_tools=span.expected_tools,
1227
+ # DFS and trace metric evaluation
1228
+ def dfs(
1229
+ span: BaseSpan,
1230
+ progress: Optional[Progress] = None,
1231
+ pbar_eval_id: Optional[int] = None,
1232
+ ):
1233
+ metrics: List[BaseMetric] = list(span.metrics or [])
1234
+ api_span: BaseApiSpan = (
1235
+ trace_manager._convert_span_to_api_span(span)
1107
1236
  )
1108
1237
 
1109
- # add trace if task completion
1110
- if requires_trace:
1111
- if llm_test_case is None:
1112
- llm_test_case = LLMTestCase(input="None")
1113
- llm_test_case._trace_dict = (
1114
- trace_manager.create_nested_spans_dict(span)
1115
- )
1116
- else:
1117
- if llm_test_case is None:
1238
+ if isinstance(span, AgentSpan):
1239
+ trace_api.agent_spans.append(api_span)
1240
+ elif isinstance(span, LlmSpan):
1241
+ trace_api.llm_spans.append(api_span)
1242
+ log_prompt(span, test_run_manager)
1243
+ elif isinstance(span, RetrieverSpan):
1244
+ trace_api.retriever_spans.append(api_span)
1245
+ elif isinstance(span, ToolSpan):
1246
+ trace_api.tool_spans.append(api_span)
1247
+ else:
1248
+ trace_api.base_spans.append(api_span)
1249
+
1250
+ if _skip_metrics_for_error(
1251
+ span=span, trace=current_trace
1252
+ ):
1118
1253
  api_span.status = TraceSpanApiStatus.ERRORED
1119
- api_span.error = format_error_text(
1120
- DeepEvalError(
1121
- "Span has metrics but no LLMTestCase. "
1122
- "Are you sure you called `update_current_span()`?"
1123
- )
1254
+ api_span.error = span.error or _trace_error(
1255
+ current_trace
1124
1256
  )
1125
1257
  if progress and pbar_eval_id is not None:
1126
1258
  update_pbar(
@@ -1130,155 +1262,386 @@ def execute_agentic_test_cases(
1130
1262
  )
1131
1263
  return
1132
1264
 
1133
- # Preparing metric calculation
1134
- api_span.metrics_data = []
1135
- for metric in metrics:
1136
- metric.skipped = False
1137
- metric.error = None
1138
- if display_config.verbose_mode is not None:
1139
- metric.verbose_mode = display_config.verbose_mode
1140
-
1141
- # Metric calculation
1142
- for metric in metrics:
1143
- metric_data = None
1144
- res = _execute_metric(
1145
- metric=metric,
1146
- test_case=llm_test_case,
1147
- show_metric_indicator=show_metric_indicator,
1148
- in_component=True,
1149
- error_config=error_config,
1150
- )
1151
- if res == "skip":
1152
- continue
1153
- metric_data = create_metric_data(metric)
1154
- api_span.metrics_data.append(metric_data)
1155
- api_test_case.update_status(metric_data.success)
1156
- update_pbar(progress, pbar_eval_id)
1157
-
1158
- trace_level_metrics_count = (
1159
- len(current_trace.metrics) if current_trace.metrics else 0
1160
- )
1161
- pbar_eval_id = add_pbar(
1162
- progress,
1163
- f" 🎯 Evaluating component(s) (#{count})",
1164
- total=count_metrics_in_trace(trace=current_trace)
1165
- + trace_level_metrics_count,
1166
- )
1167
-
1168
- start_time = time.perf_counter()
1265
+ # evaluate children first
1266
+ for child in span.children:
1267
+ dfs(child, progress, pbar_eval_id)
1169
1268
 
1170
- skip_metrics_for_this_golden = False
1171
- # Handle trace-level metrics
1172
- if _skip_metrics_for_error(trace=current_trace):
1173
- trace_api.status = TraceSpanApiStatus.ERRORED
1174
- if progress and pbar_eval_id is not None:
1175
- update_pbar(
1176
- progress,
1177
- pbar_eval_id,
1178
- advance=count_total_metrics_for_trace(
1179
- current_trace
1180
- ),
1269
+ # If there are no metrics, then there is nothing to do on this span.
1270
+ if not metrics:
1271
+ return
1272
+
1273
+ has_task_completion = any(
1274
+ isinstance(metric, TaskCompletionMetric)
1275
+ for metric in metrics
1181
1276
  )
1182
- else:
1183
- if current_trace.metrics:
1277
+
1184
1278
  requires_trace = any(
1185
- metric.requires_trace
1186
- for metric in current_trace.metrics
1279
+ getattr(metric, "requires_trace", False)
1280
+ for metric in metrics
1187
1281
  )
1188
1282
 
1189
1283
  llm_test_case = None
1190
- if current_trace.input:
1284
+ if span.input is not None:
1191
1285
  llm_test_case = LLMTestCase(
1192
- input=str(current_trace.input),
1286
+ input=str(span.input),
1193
1287
  actual_output=(
1194
- str(current_trace.output)
1195
- if current_trace.output is not None
1288
+ str(span.output)
1289
+ if span.output is not None
1196
1290
  else None
1197
1291
  ),
1198
- expected_output=current_trace.expected_output,
1199
- context=current_trace.context,
1200
- retrieval_context=current_trace.retrieval_context,
1201
- tools_called=current_trace.tools_called,
1202
- expected_tools=current_trace.expected_tools,
1292
+ expected_output=span.expected_output,
1293
+ context=span.context,
1294
+ retrieval_context=span.retrieval_context,
1295
+ tools_called=span.tools_called,
1296
+ expected_tools=span.expected_tools,
1203
1297
  )
1204
- if requires_trace:
1298
+
1299
+ # If any metric needs a trace tree or a completion verdict, attach the trace
1300
+ if has_task_completion or requires_trace:
1205
1301
  if llm_test_case is None:
1206
1302
  llm_test_case = LLMTestCase(input="None")
1207
1303
  llm_test_case._trace_dict = (
1208
- trace_manager.create_nested_spans_dict(
1209
- current_trace.root_spans[0]
1210
- )
1304
+ trace_manager.create_nested_spans_dict(span)
1211
1305
  )
1212
1306
  else:
1307
+ # Without a test case we cannot evaluate span metrics
1213
1308
  if llm_test_case is None:
1214
- current_trace.status = TraceSpanStatus.ERRORED
1215
- trace_api.status = TraceSpanApiStatus.ERRORED
1216
- if current_trace.root_spans:
1217
- current_trace.root_spans[0].status = (
1218
- TraceSpanStatus.ERRORED
1219
- )
1220
- current_trace.root_spans[0].error = (
1221
- format_error_text(
1222
- DeepEvalError(
1223
- "Trace has metrics but no LLMTestCase (missing input/output). "
1224
- "Are you sure you called `update_current_trace()`?"
1225
- )
1226
- )
1309
+ api_span.status = TraceSpanApiStatus.ERRORED
1310
+ api_span.error = format_error_text(
1311
+ DeepEvalError(
1312
+ "Span has metrics but no LLMTestCase. "
1313
+ "Are you sure you called `update_current_span()`?"
1227
1314
  )
1315
+ )
1228
1316
  if progress and pbar_eval_id is not None:
1229
1317
  update_pbar(
1230
1318
  progress,
1231
1319
  pbar_eval_id,
1232
- advance=count_total_metrics_for_trace(
1233
- current_trace
1320
+ advance=count_metrics_in_span_subtree(
1321
+ span
1234
1322
  ),
1235
1323
  )
1236
- skip_metrics_for_this_golden = True
1324
+ return
1325
+
1326
+ # Preparing metric calculation
1327
+ api_span.metrics_data = []
1328
+ for metric in metrics:
1329
+ metric.skipped = False
1330
+ metric.error = None
1331
+ if display_config.verbose_mode is not None:
1332
+ metric.verbose_mode = (
1333
+ display_config.verbose_mode
1334
+ )
1237
1335
 
1238
- if not skip_metrics_for_this_golden:
1239
- for metric in current_trace.metrics:
1240
- metric.skipped = False
1241
- metric.error = None
1242
- if display_config.verbose_mode is not None:
1243
- metric.verbose_mode = (
1244
- display_config.verbose_mode
1336
+ # Metric calculation
1337
+ for metric in metrics:
1338
+ res = _execute_metric(
1339
+ metric=metric,
1340
+ test_case=llm_test_case,
1341
+ show_metric_indicator=show_metric_indicator,
1342
+ in_component=True,
1343
+ error_config=error_config,
1344
+ )
1345
+ if res == "skip":
1346
+ continue
1347
+ metric_data = create_metric_data(metric)
1348
+ api_span.metrics_data.append(metric_data)
1349
+ api_test_case.update_status(metric_data.success)
1350
+ update_pbar(progress, pbar_eval_id)
1351
+
1352
+ trace_level_metrics_count = (
1353
+ len(current_trace.metrics)
1354
+ if current_trace and current_trace.metrics
1355
+ else 0
1356
+ )
1357
+ pbar_eval_id = add_pbar(
1358
+ progress,
1359
+ f" 🎯 Evaluating component(s) (#{count})",
1360
+ total=count_metrics_in_trace(trace=current_trace)
1361
+ + trace_level_metrics_count,
1362
+ )
1363
+
1364
+ start_time = time.perf_counter()
1365
+
1366
+ skip_metrics_for_this_golden = False
1367
+ if _skip_metrics_for_error(trace=current_trace):
1368
+ trace_api.status = TraceSpanApiStatus.ERRORED
1369
+ if progress and pbar_eval_id is not None:
1370
+ update_pbar(
1371
+ progress,
1372
+ pbar_eval_id,
1373
+ advance=count_total_metrics_for_trace(
1374
+ current_trace
1375
+ ),
1376
+ )
1377
+ else:
1378
+ if current_trace and current_trace.metrics:
1379
+ has_task_completion = any(
1380
+ isinstance(metric, TaskCompletionMetric)
1381
+ for metric in current_trace.metrics
1382
+ )
1383
+ requires_trace = any(
1384
+ getattr(metric, "requires_trace", False)
1385
+ for metric in current_trace.metrics
1386
+ )
1387
+ llm_test_case = None
1388
+ if current_trace.input:
1389
+ llm_test_case = LLMTestCase(
1390
+ input=str(current_trace.input),
1391
+ actual_output=(
1392
+ str(current_trace.output)
1393
+ if current_trace.output is not None
1394
+ else None
1395
+ ),
1396
+ expected_output=current_trace.expected_output,
1397
+ context=current_trace.context,
1398
+ retrieval_context=current_trace.retrieval_context,
1399
+ tools_called=current_trace.tools_called,
1400
+ expected_tools=current_trace.expected_tools,
1401
+ )
1402
+ if has_task_completion or requires_trace:
1403
+ if llm_test_case is None:
1404
+ llm_test_case = LLMTestCase(input="None")
1405
+ llm_test_case._trace_dict = (
1406
+ trace_manager.create_nested_spans_dict(
1407
+ current_trace.root_spans[0]
1408
+ )
1409
+ )
1410
+ else:
1411
+ if llm_test_case is None:
1412
+ current_trace.status = (
1413
+ TraceSpanStatus.ERRORED
1414
+ )
1415
+ trace_api.status = (
1416
+ TraceSpanApiStatus.ERRORED
1245
1417
  )
1418
+ if current_trace.root_spans:
1419
+ current_trace.root_spans[0].status = (
1420
+ TraceSpanStatus.ERRORED
1421
+ )
1422
+ current_trace.root_spans[0].error = (
1423
+ format_error_text(
1424
+ DeepEvalError(
1425
+ "Trace has metrics but no LLMTestCase (missing input/output). "
1426
+ "Are you sure you called `update_current_trace()`?"
1427
+ )
1428
+ )
1429
+ )
1430
+ if progress and pbar_eval_id is not None:
1431
+ update_pbar(
1432
+ progress,
1433
+ pbar_eval_id,
1434
+ advance=count_total_metrics_for_trace(
1435
+ current_trace
1436
+ ),
1437
+ )
1438
+ skip_metrics_for_this_golden = True
1439
+
1440
+ if not skip_metrics_for_this_golden:
1441
+ for metric in current_trace.metrics:
1442
+ metric.skipped = False
1443
+ metric.error = None
1444
+ if display_config.verbose_mode is not None:
1445
+ metric.verbose_mode = (
1446
+ display_config.verbose_mode
1447
+ )
1246
1448
 
1247
- trace_api.metrics_data = []
1248
- for metric in current_trace.metrics:
1249
- res = _execute_metric(
1250
- metric=metric,
1251
- test_case=llm_test_case,
1252
- show_metric_indicator=show_metric_indicator,
1253
- in_component=True,
1254
- error_config=error_config,
1449
+ trace_api.metrics_data = []
1450
+ for metric in current_trace.metrics:
1451
+ res = _execute_metric(
1452
+ metric=metric,
1453
+ test_case=llm_test_case,
1454
+ show_metric_indicator=show_metric_indicator,
1455
+ in_component=True,
1456
+ error_config=error_config,
1457
+ )
1458
+ if res == "skip":
1459
+ continue
1460
+
1461
+ if not metric.skipped:
1462
+ metric_data = create_metric_data(metric)
1463
+ trace_api.metrics_data.append(
1464
+ metric_data
1465
+ )
1466
+ api_test_case.update_metric_data(
1467
+ metric_data
1468
+ )
1469
+ api_test_case.update_status(
1470
+ metric_data.success
1471
+ )
1472
+ emitted_trace.add(id(metric))
1473
+ update_pbar(progress, pbar_eval_id)
1474
+
1475
+ # handle span metrics
1476
+ dfs(
1477
+ current_trace.root_spans[0],
1478
+ progress,
1479
+ pbar_eval_id,
1480
+ )
1481
+
1482
+ # TODO: Do I need this block, or is it duplicated in finally?
1483
+ end_time = time.perf_counter()
1484
+ run_duration = end_time - start_time
1485
+ api_test_case.update_run_duration(run_duration)
1486
+ test_run_manager.update_test_run(api_test_case, test_case)
1487
+ test_results.append(create_test_result(api_test_case))
1488
+ test_results.extend(extract_trace_test_results(trace_api))
1489
+ update_pbar(progress, pbar_id)
1490
+ pbar_case_increments += 1
1491
+
1492
+ # run the golden with a timeout
1493
+ start_time = time.perf_counter()
1494
+ deadline = _per_task_timeout()
1495
+
1496
+ try:
1497
+ run_sync_with_timeout(_run_golden, deadline)
1498
+ except (asyncio.TimeoutError, TimeoutError):
1499
+ # mark any not yet finished trace level and span level metrics as timed out.
1500
+ msg = (
1501
+ f"Timed out after {deadline:.2f}s while executing agentic test case. "
1502
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
1503
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
1504
+ )
1505
+
1506
+ if current_trace is not None:
1507
+ # Trace-level metrics
1508
+ if getattr(current_trace, "metrics", None):
1509
+ for m in current_trace.metrics:
1510
+ if getattr(m, "skipped", False):
1511
+ continue
1512
+ # if already has a terminal state, leave it alone
1513
+ if getattr(
1514
+ m, "success", None
1515
+ ) is not None or getattr(m, "error", None):
1516
+ continue
1517
+ m.success = False
1518
+ m.error = msg
1519
+
1520
+ # span level metrics, walk the tree
1521
+ def _walk(span):
1522
+ for child in getattr(span, "children", []) or []:
1523
+ _walk(child)
1524
+ for m in list(getattr(span, "metrics", []) or []):
1525
+ if getattr(m, "skipped", False):
1526
+ continue
1527
+ if getattr(
1528
+ m, "success", None
1529
+ ) is not None or getattr(m, "error", None):
1530
+ continue
1531
+ m.success = False
1532
+ m.error = msg
1533
+
1534
+ for root in getattr(current_trace, "root_spans", []) or []:
1535
+ _walk(root)
1536
+
1537
+ # raise if we are not ignoring errors
1538
+ if not error_config.ignore_errors:
1539
+ raise
1540
+
1541
+ finally:
1542
+ try:
1543
+ # Ensure we have an api_test_case to attach results to.
1544
+ if api_test_case is None:
1545
+ # build a minimal test_case
1546
+ if test_case is None:
1547
+ out = (
1548
+ str(current_trace.output)
1549
+ if (
1550
+ current_trace is not None
1551
+ and current_trace.output is not None
1255
1552
  )
1256
- if res == "skip":
1553
+ else None
1554
+ )
1555
+ test_case = LLMTestCase(
1556
+ input=golden.input,
1557
+ actual_output=out,
1558
+ expected_output=(
1559
+ current_trace.expected_output
1560
+ if current_trace
1561
+ else None
1562
+ ),
1563
+ context=(
1564
+ current_trace.context
1565
+ if current_trace
1566
+ else None
1567
+ ),
1568
+ retrieval_context=(
1569
+ current_trace.retrieval_context
1570
+ if current_trace
1571
+ else None
1572
+ ),
1573
+ additional_metadata=golden.additional_metadata,
1574
+ tools_called=(
1575
+ current_trace.tools_called
1576
+ if current_trace
1577
+ else None
1578
+ ),
1579
+ expected_tools=(
1580
+ current_trace.expected_tools
1581
+ if current_trace
1582
+ else None
1583
+ ),
1584
+ comments=golden.comments,
1585
+ name=golden.name,
1586
+ _dataset_alias=golden._dataset_alias,
1587
+ _dataset_id=golden._dataset_id,
1588
+ )
1589
+
1590
+ # Create a trace API if we have a trace
1591
+ if trace_api is None and current_trace is not None:
1592
+ trace_api = create_api_trace(current_trace, golden)
1593
+
1594
+ api_test_case = create_api_test_case(
1595
+ test_case=test_case,
1596
+ trace=trace_api,
1597
+ index=count if not _is_assert_test else None,
1598
+ )
1599
+
1600
+ if test_run is not None:
1601
+ test_run_manager.set_test_run(test_run)
1602
+
1603
+ if api_test_case.success is None:
1604
+ api_test_case.update_status(False)
1605
+
1606
+ # try to update metric data
1607
+ if current_trace is not None:
1608
+ if current_trace.metrics:
1609
+ for m in current_trace.metrics:
1610
+ if getattr(m, "skipped", False):
1611
+ continue
1612
+ if id(m) in emitted_trace:
1257
1613
  continue
1614
+ api_test_case.update_metric_data(
1615
+ create_metric_data(m)
1616
+ )
1258
1617
 
1259
- if not metric.skipped:
1260
- metric_data = create_metric_data(metric)
1261
- trace_api.metrics_data.append(metric_data)
1262
- api_test_case.update_metric_data(
1263
- metric_data
1264
- )
1265
- api_test_case.update_status(
1266
- metric_data.success
1267
- )
1268
- update_pbar(progress, pbar_eval_id)
1618
+ # Finalize duration and persist
1619
+ elapsed = time.perf_counter() - start_time
1620
+ api_test_case.update_run_duration(
1621
+ elapsed if elapsed >= 0 else deadline
1622
+ )
1269
1623
 
1270
- # Then handle span-level metrics
1271
- dfs(current_trace.root_spans[0], progress, pbar_eval_id)
1624
+ if (
1625
+ api_test_case.metrics_data == []
1626
+ and api_test_case.trace is None
1627
+ ):
1628
+ api_test_case.metrics_data = None
1272
1629
 
1273
- end_time = time.perf_counter()
1274
- run_duration = end_time - start_time
1275
- # Update test run
1276
- api_test_case.update_run_duration(run_duration)
1277
- test_run_manager.update_test_run(api_test_case, test_case)
1278
- test_results.append(create_test_result(api_test_case))
1279
- test_results.extend(extract_trace_test_results(trace_api))
1630
+ test_run_manager.update_test_run(api_test_case, test_case)
1631
+ test_results.append(create_test_result(api_test_case))
1280
1632
 
1281
- update_pbar(progress, pbar_id)
1633
+ if trace_api is not None:
1634
+ test_results.extend(
1635
+ extract_trace_test_results(trace_api)
1636
+ )
1637
+
1638
+ missing = 2 - pbar_case_increments
1639
+ if missing > 0:
1640
+ update_pbar(progress, pbar_id, advance=missing)
1641
+
1642
+ finally:
1643
+ # nothing to clean here, but keep symmetry with other paths
1644
+ pass
1282
1645
 
1283
1646
  if display_config.show_indicator and _use_bar_indicator:
1284
1647
  progress = Progress(
@@ -1319,9 +1682,9 @@ async def a_execute_agentic_test_cases(
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-            return await asyncio.wait_for(
-                func(*args, **kwargs),
-                timeout=_per_task_timeout(),
+            timeout = _per_task_timeout()
+            return await _await_with_outer_deadline(
+                func, *args, timeout=timeout, **kwargs
             )
 
     test_run_manager = global_test_run_manager
@@ -1374,7 +1737,7 @@ async def a_execute_agentic_test_cases(
                     asyncio.gather(*tasks),
                     timeout=_gather_timeout(),
                 )
-            except asyncio.TimeoutError:
+            except (asyncio.TimeoutError, TimeoutError):
                 # Cancel any still-pending tasks and drain them
                 for t in tasks:
                     if not t.done():
@@ -1426,94 +1789,89 @@ async def _a_execute_agentic_test_case(
 progress: Optional[Progress] = None,
 pbar_id: Optional[int] = None,
 ):
- if observed_callback:
- total_tags = count_observe_decorators_in_module(observed_callback)
- pbar_tags_id = add_pbar(
- progress,
- f" ⚡ Invoking observed callback (#{count})",
- total=total_tags,
- )
+ test_start_time = time.perf_counter()
+ current_trace = None
+ trace_api = None
+ test_case = None
+ api_test_case = None
+ try:
+ if observed_callback:
+ total_tags = count_observe_decorators_in_module(observed_callback)
+ pbar_tags_id = add_pbar(
+ progress,
+ f" ⚡ Invoking observed callback (#{count})",
+ total=total_tags,
+ )

- # Call callback and extract trace
- with Observer(
- "custom",
- func_name="Test Wrapper",
- _progress=progress,
- _pbar_callback_id=pbar_tags_id,
- ):
- if asyncio.iscoroutinefunction(observed_callback):
- await asyncio.wait_for(
- observed_callback(golden.input),
- timeout=_per_task_timeout(),
- )
- else:
- observed_callback(golden.input)
- current_trace: Trace = current_trace_context.get()
+ # Call callback and extract trace
+ with Observer(
+ "custom",
+ func_name="Test Wrapper",
+ _progress=progress,
+ _pbar_callback_id=pbar_tags_id,
+ ):
+ # get current_trace right away, we need it even if cancelled
+ current_trace: Trace = current_trace_context.get()
+ if asyncio.iscoroutinefunction(observed_callback):
+ await _await_with_outer_deadline(
+ observed_callback,
+ golden.input,
+ timeout=_per_task_timeout(),
+ )
+ else:
+ observed_callback(golden.input)

- update_pbar(progress, pbar_tags_id, advance=total_tags)
- update_pbar(progress, pbar_id)
+ update_pbar(progress, pbar_tags_id, advance=total_tags)
+ update_pbar(progress, pbar_id)

- elif trace:
- current_trace = trace
+ elif trace:
+ current_trace = trace

- if trace_metrics:
- current_trace.metrics = trace_metrics
+ trace_level_metrics_count = 0

- # run evals through DFS
- trace_api = create_api_trace(trace=current_trace, golden=golden)
+ if trace_metrics:
+ current_trace.metrics = trace_metrics

- trace_level_metrics_count = (
- len(current_trace.metrics) if current_trace.metrics else 0
- )
+ # run evals through DFS
+ trace_api = create_api_trace(trace=current_trace, golden=golden)

- pbar_eval_id = add_pbar(
- progress,
- f" 🎯 Evaluating component(s) (#{count})",
- total=count_metrics_in_trace(trace=current_trace)
- + trace_level_metrics_count,
- )
+ trace_level_metrics_count = (
+ len(current_trace.metrics) if current_trace.metrics else 0
+ )

- test_case = LLMTestCase(
- input=golden.input,
- actual_output=(
- str(current_trace.output)
- if current_trace.output is not None
- else None
- ),
- expected_output=current_trace.expected_output,
- context=current_trace.context,
- retrieval_context=current_trace.retrieval_context,
- tools_called=current_trace.tools_called,
- expected_tools=current_trace.expected_tools,
- additional_metadata=golden.additional_metadata,
- comments=golden.comments,
- name=golden.name,
- _dataset_alias=golden._dataset_alias,
- _dataset_id=golden._dataset_id,
- )
- api_test_case = create_api_test_case(
- test_case=test_case,
- trace=trace_api,
- index=count if not _is_assert_test else None,
- )
+ pbar_eval_id = add_pbar(
+ progress,
+ f" 🎯 Evaluating component(s) (#{count})",
+ total=count_metrics_in_trace(trace=current_trace)
+ + trace_level_metrics_count,
+ )

- await _a_execute_trace_test_case(
- trace=current_trace,
- trace_api=trace_api,
- api_test_case=api_test_case,
- ignore_errors=ignore_errors,
- skip_on_missing_params=skip_on_missing_params,
- show_indicator=show_indicator,
- verbose_mode=verbose_mode,
- progress=progress,
- pbar_eval_id=pbar_eval_id,
- _use_bar_indicator=_use_bar_indicator,
- )
+ test_case = LLMTestCase(
+ input=golden.input,
+ actual_output=(
+ str(current_trace.output)
+ if current_trace.output is not None
+ else None
+ ),
+ expected_output=current_trace.expected_output,
+ context=current_trace.context,
+ retrieval_context=current_trace.retrieval_context,
+ tools_called=current_trace.tools_called,
+ expected_tools=current_trace.expected_tools,
+ additional_metadata=golden.additional_metadata,
+ comments=golden.comments,
+ name=golden.name,
+ _dataset_alias=golden._dataset_alias,
+ _dataset_id=golden._dataset_id,
+ )
+ api_test_case = create_api_test_case(
+ test_case=test_case,
+ trace=trace_api,
+ index=count if not _is_assert_test else None,
+ )

- async def dfs(trace: Trace, span: BaseSpan):
- await _a_execute_span_test_case(
- span=span,
- current_trace=trace,
+ await _a_execute_trace_test_case(
+ trace=current_trace,
 trace_api=trace_api,
 api_test_case=api_test_case,
 ignore_errors=ignore_errors,
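One detail worth noting in the hunk above: the trace is now read from `current_trace_context` before the callback is awaited, so the reference survives even if the await is cancelled. A minimal, self-contained illustration of that ordering with `contextvars` (the variable and trace shape here are illustrative, not deepeval's):

```python
# Minimal illustration of reading the context variable *before* the await:
# if the await is cancelled or times out, the reference is already captured
# and can still be used for reporting. Names here are illustrative.
import asyncio
import contextvars
from typing import Optional

current_trace_var: contextvars.ContextVar[Optional[dict]] = contextvars.ContextVar(
    "current_trace", default=None
)


async def run_case() -> None:
    current_trace_var.set({"uuid": "trace-123"})
    trace = current_trace_var.get()  # captured up front
    try:
        await asyncio.wait_for(asyncio.sleep(10), timeout=0.01)
    except (asyncio.TimeoutError, TimeoutError):
        # Even though the callback never finished, we still know which trace
        # this case belongs to.
        print("cancelled, but trace is known:", trace["uuid"])


asyncio.run(run_case())
```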
@@ -1522,56 +1880,150 @@ async def _a_execute_agentic_test_case(
 verbose_mode=verbose_mode,
 progress=progress,
 pbar_eval_id=pbar_eval_id,
- test_run_manager=test_run_manager,
 _use_bar_indicator=_use_bar_indicator,
 )

- if _skip_metrics_for_error(span=span, trace=trace):
- return
+ async def dfs(trace: Trace, span: BaseSpan):
+ await _a_execute_span_test_case(
+ span=span,
+ current_trace=trace,
+ trace_api=trace_api,
+ api_test_case=api_test_case,
+ ignore_errors=ignore_errors,
+ skip_on_missing_params=skip_on_missing_params,
+ show_indicator=show_indicator,
+ verbose_mode=verbose_mode,
+ progress=progress,
+ pbar_eval_id=pbar_eval_id,
+ test_run_manager=test_run_manager,
+ _use_bar_indicator=_use_bar_indicator,
+ )

- child_tasks = [
- asyncio.create_task(dfs(trace, child)) for child in span.children
- ]
- if child_tasks:
- try:
- await asyncio.wait_for(
- asyncio.gather(*child_tasks),
- timeout=_gather_timeout(),
- )
- except asyncio.TimeoutError:
- for t in child_tasks:
- if not t.done():
- t.cancel()
- await asyncio.gather(*child_tasks, return_exceptions=True)
- raise
+ if _skip_metrics_for_error(span=span, trace=trace):
+ return

- test_start_time = time.perf_counter()
+ child_tasks = [
+ asyncio.create_task(dfs(trace, child))
+ for child in span.children
+ ]
+ if child_tasks:
+ try:
+ await asyncio.wait_for(
+ asyncio.gather(*child_tasks),
+ timeout=_gather_timeout(),
+ )
+ except (asyncio.TimeoutError, TimeoutError):
+ for t in child_tasks:
+ if not t.done():
+ t.cancel()
+ await asyncio.gather(*child_tasks, return_exceptions=True)
+ raise

- if not _skip_metrics_for_error(trace=current_trace):
- if current_trace and current_trace.root_spans:
- await dfs(current_trace, current_trace.root_spans[0])
- else:
- if (
- logger.isEnabledFor(logging.DEBUG)
- and get_settings().DEEPEVAL_VERBOSE_MODE
- ):
- logger.debug(
- "Skipping DFS: empty trace or no root spans (trace=%s)",
- current_trace.uuid if current_trace else None,
+ if not _skip_metrics_for_error(trace=current_trace):
+ if current_trace and current_trace.root_spans:
+ await dfs(current_trace, current_trace.root_spans[0])
+ else:
+ if (
+ logger.isEnabledFor(logging.DEBUG)
+ and get_settings().DEEPEVAL_VERBOSE_MODE
+ ):
+ logger.debug(
+ "Skipping DFS: empty trace or no root spans (trace=%s)",
+ current_trace.uuid if current_trace else None,
+ )
+ except asyncio.CancelledError:
+ # mark any unfinished metrics as cancelled
+ cancel_msg = (
+ "Timed out/cancelled while evaluating agentic test case. "
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+ )
+
+ if trace_metrics:
+ for m in trace_metrics:
+ if getattr(m, "skipped", False):
+ continue
+ if getattr(m, "success", None) is None and not getattr(
+ m, "error", None
+ ):
+ m.success = False
+ m.error = cancel_msg
+
+ if trace is not None and trace.metrics:
+ for m in trace.metrics:
+ if getattr(m, "skipped", False):
+ continue
+ if getattr(m, "success", None) is None and not getattr(
+ m, "error", None
+ ):
+ m.success = False
+ m.error = cancel_msg
+ if not ignore_errors:
+ raise
+ finally:
+ try:
+ if api_test_case is None:
+ if test_case is None:
+ test_case = LLMTestCase(
+ input=golden.input,
+ actual_output=None,
+ expected_output=None,
+ context=None,
+ retrieval_context=None,
+ additional_metadata=golden.additional_metadata,
+ tools_called=None,
+ expected_tools=None,
+ comments=golden.comments,
+ name=golden.name,
+ _dataset_alias=golden._dataset_alias,
+ _dataset_id=golden._dataset_id,
+ )
+ if trace is not None and trace_api is None:
+ trace_api = create_api_trace(trace, golden)
+
+ api_test_case = create_api_test_case(
+ test_case=test_case,
+ trace=trace_api,
+ index=(count if not _is_assert_test else None),
 )

- test_end_time = time.perf_counter()
- run_duration = test_end_time - test_start_time
+ # attach MetricData for any trace metrics we marked above
+ if trace_metrics:
+ for m in trace_metrics:
+ if getattr(m, "skipped", False):
+ continue
+ api_test_case.update_metric_data(create_metric_data(m))
+
+ # If nothing set success yet, mark the case failed
+ if api_test_case.success is None:
+ api_test_case.update_status(False)

- api_test_case.update_run_duration(run_duration)
- test_run_manager.update_test_run(api_test_case, test_case)
- main_result = create_test_result(api_test_case)
- trace_results = extract_trace_test_results(trace_api)
- unique_trace_results = filter_duplicate_results(main_result, trace_results)
- test_results.append(main_result)
- test_results.extend(unique_trace_results)
+ # test_run_manager.update_test_run returns early if api_test_case.metrics_data is an empty list.
+ # Set it to None to ensure the test_case is added
+ if api_test_case.metrics_data == [] and api_test_case.trace is None:
+ api_test_case.metrics_data = None

- update_pbar(progress, pbar_id)
+ # Duration & persist
+ test_end_time = time.perf_counter()
+ run_duration = test_end_time - test_start_time
+ api_test_case.update_run_duration(run_duration)
+ test_run_manager.update_test_run(api_test_case, test_case)
+
+ # Build results and de-duplicate against trace results
+ main_result = create_test_result(api_test_case)
+ trace_results = (
+ extract_trace_test_results(trace_api)
+ if trace_api is not None
+ else []
+ )
+ unique_trace_results = filter_duplicate_results(
+ main_result, trace_results
+ )
+ test_results.append(main_result)
+ test_results.extend(unique_trace_results)
+ update_pbar(progress, pbar_id)
+ finally:
+ pass


 async def _a_execute_span_test_case(
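The restructured function above follows a cancel-then-persist shape: on cancellation or timeout, any metric that never produced a verdict is marked failed with an explanatory message, and a `finally` block still builds a fallback test case and records whatever partial results exist. A stripped-down sketch of that shape with stand-in names (`Metric`, `persist`, `results` are not deepeval's API):

```python
# Stripped-down sketch of the cancel-then-persist shape.
# Metric, persist, and results are stand-ins, not deepeval's API.
import asyncio
import time
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Metric:
    name: str
    success: Optional[bool] = None
    error: Optional[str] = None
    skipped: bool = False


results: List[dict] = []


def persist(metrics: List[Metric], duration: float) -> None:
    results.append({"metrics": [m.__dict__ for m in metrics], "duration": duration})


async def run_case(metrics: List[Metric]) -> None:
    start = time.perf_counter()
    try:
        await asyncio.wait_for(asyncio.sleep(10), timeout=0.01)  # the "work"
    except (asyncio.TimeoutError, TimeoutError, asyncio.CancelledError):
        # Give every unfinished, non-skipped metric an explicit failure reason.
        for m in metrics:
            if not m.skipped and m.success is None and not m.error:
                m.success = False
                m.error = "Timed out/cancelled before the metric finished."
    finally:
        # Persist whatever we have, even on the error path.
        persist(metrics, time.perf_counter() - start)


asyncio.run(run_case([Metric("task_completion")]))
print(results)
```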
@@ -2177,9 +2629,8 @@ def a_execute_agentic_test_cases_from_loop(

 async def execute_callback_with_semaphore(coroutine: Awaitable):
 async with semaphore:
- return await asyncio.wait_for(
- coroutine, timeout=_per_task_timeout()
- )
+ timeout = _per_task_timeout()
+ return await _await_with_outer_deadline(coroutine, timeout=timeout)

 def evaluate_test_cases(
 progress: Optional[Progress] = None,
@@ -2328,16 +2779,25 @@ def a_execute_agentic_test_cases_from_loop(
 meta,
 )
 elif exc is not None:
+
+ show_trace = bool(
+ get_settings().DEEPEVAL_LOG_STACK_TRACES
+ )
+ exc_info = (
+ (
+ type(exc),
+ exc,
+ getattr(exc, "__traceback__", None),
+ )
+ if show_trace
+ else None
+ )
 logger.error(
 "[deepeval] task ERROR %s after %.2fs meta=%r",
 t.get_name(),
 duration,
 meta,
- exc_info=(
- type(exc),
- exc,
- getattr(exc, "__traceback__", None),
- ),
+ exc_info=exc_info,
 )
 else:
 logger.info(
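Gating `exc_info` on a settings flag keeps error logs to one line by default while still allowing full tracebacks on demand. A generic version of the same idea, reading the flag from an environment variable rather than deepeval's settings object:

```python
# Generic version of conditional-traceback logging; the flag is read from an
# environment variable here instead of deepeval's settings object.
import logging
import os

logger = logging.getLogger("example")
logging.basicConfig(level=logging.INFO)


def log_task_error(task_name: str, duration: float, exc: BaseException) -> None:
    show_trace = os.getenv("LOG_STACK_TRACES", "0") == "1"
    # logging treats a falsy exc_info as "no traceback", so None is safe here.
    exc_info = (type(exc), exc, exc.__traceback__) if show_trace else None
    logger.error("task ERROR %s after %.2fs", task_name, duration, exc_info=exc_info)


try:
    raise ValueError("boom")
except ValueError as err:
    log_task_error("case-7", 1.23, err)  # one line unless LOG_STACK_TRACES=1
```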
@@ -2396,7 +2856,7 @@ def a_execute_agentic_test_cases_from_loop(
 )
 )

- except asyncio.TimeoutError:
+ except (asyncio.TimeoutError, TimeoutError):
 import traceback

 pending = [t for t in created_tasks if not t.done()]
@@ -2609,9 +3069,9 @@ async def _a_evaluate_traces(

 async def execute_evals_with_semaphore(func: Callable, *args, **kwargs):
 async with semaphore:
- return await asyncio.wait_for(
- func(*args, **kwargs),
- timeout=_per_task_timeout(),
+ timeout = _per_task_timeout()
+ return await _await_with_outer_deadline(
+ func, *args, timeout=timeout, **kwargs
 )

 eval_tasks = []
@@ -2661,7 +3121,7 @@ async def _a_evaluate_traces(
 asyncio.gather(*eval_tasks),
 timeout=_gather_timeout(),
 )
- except asyncio.TimeoutError:
+ except (asyncio.TimeoutError, TimeoutError):
 for t in eval_tasks:
 if not t.done():
 t.cancel()
@@ -2689,9 +3149,9 @@ async def _evaluate_test_case_pairs(

 async def execute_with_semaphore(func: Callable, *args, **kwargs):
 async with semaphore:
- return await asyncio.wait_for(
- func(*args, **kwargs),
- timeout=_per_task_timeout(),
+ timeout = _per_task_timeout()
+ return await _await_with_outer_deadline(
+ func, *args, timeout=timeout, **kwargs
 )

 tasks = []
@@ -2731,7 +3191,7 @@ async def _evaluate_test_case_pairs(
 asyncio.gather(*tasks),
 timeout=_gather_timeout(),
 )
- except asyncio.TimeoutError:
+ except (asyncio.TimeoutError, TimeoutError):
 # Cancel any still-pending tasks and drain them
 for t in tasks:
 if not t.done():
@@ -2756,6 +3216,9 @@ def _execute_metric(
 )
 except MissingTestCaseParamsError as e:
 if error_config.skip_on_missing_params:
+ metric.skipped = True
+ metric.error = None
+ metric.success = None
 return "skip"
 else:
 if error_config.ignore_errors:
@@ -2768,6 +3231,9 @@ def _execute_metric(
 metric.measure(test_case)
 except MissingTestCaseParamsError as e:
 if error_config.skip_on_missing_params:
+ metric.skipped = True
+ metric.error = None
+ metric.success = None
 return "skip"
 else:
 if error_config.ignore_errors:
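Both branches above now record *why* a metric produced no result: `skipped=True` with `error` and `success` cleared, so downstream aggregation can tell "skipped for missing test-case params" apart from "failed". A compact sketch of consuming that flag (the metric class here is a stand-in, not a deepeval metric):

```python
# Compact sketch of consuming the `skipped` flag; FakeMetric is a stand-in,
# not a deepeval metric class.
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class FakeMetric:
    name: str
    success: Optional[bool] = None
    error: Optional[str] = None
    skipped: bool = False


def summarize(metrics: List[FakeMetric]) -> dict:
    return {
        "skipped": [m.name for m in metrics if m.skipped],
        "failed": [m.name for m in metrics if not m.skipped and m.success is False],
        "passed": [m.name for m in metrics if m.success is True],
    }


metrics = [
    FakeMetric("exact_match", success=True),
    FakeMetric("pattern_match", skipped=True),  # missing test-case params
    FakeMetric("task_completion", success=False, error="timeout"),
]
print(summarize(metrics))
```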