deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
@@ -51,20 +51,16 @@ from deepeval.utils import (
  shorten,
  len_medium,
  format_error_text,
+ are_timeouts_disabled,
+ get_per_task_timeout_seconds,
+ get_gather_timeout_seconds,
+ get_gather_timeout,
  )
  from deepeval.telemetry import capture_evaluation_run
  from deepeval.metrics import (
  BaseMetric,
  BaseConversationalMetric,
- BaseMultimodalMetric,
  TaskCompletionMetric,
- # RAG metrics that support both single-turn and multimodal
- ContextualPrecisionMetric,
- ContextualRecallMetric,
- ContextualRelevancyMetric,
- AnswerRelevancyMetric,
- FaithfulnessMetric,
- ToolCorrectnessMetric,
  )
  from deepeval.metrics.indicator import (
  measure_metrics_with_indicator,
@@ -116,14 +112,56 @@ from deepeval.test_run.hyperparameters import (

  logger = logging.getLogger(__name__)

- MLLM_SUPPORTED_METRICS = [
- ContextualPrecisionMetric,
- ContextualRecallMetric,
- ContextualRelevancyMetric,
- AnswerRelevancyMetric,
- FaithfulnessMetric,
- ToolCorrectnessMetric,
- ]
+
+ def _timeout_msg(action: str, seconds: float) -> str:
+ if are_timeouts_disabled():
+ return (
+ f"Timeout occurred while {action} "
+ "(DeepEval timeouts are disabled; this likely came from the model/provider SDK or network layer). "
+ "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+ )
+ return (
+ f"Timed out after {seconds:.2f}s while {action}. "
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+ )
+
+
+ def _log_gather_timeout(
+ logger,
+ *,
+ exc: Optional[BaseException] = None,
+ pending: Optional[int] = None,
+ ) -> None:
+ settings = get_settings()
+ if are_timeouts_disabled():
+ logger.warning(
+ "A task raised %s while waiting for gathered results; DeepEval gather/per-task timeouts are disabled%s. "
+ "This likely came from the model/provider SDK or network layer.",
+ type(exc).__name__ if exc else "TimeoutError",
+ f" (pending={pending})" if pending is not None else "",
+ exc_info=settings.DEEPEVAL_LOG_STACK_TRACES,
+ )
+ else:
+ if pending is not None:
+ logger.warning(
+ "Gather TIMEOUT after %.1fs; pending=%d tasks. "
+ "Some metrics may be marked as timed out. "
+ "To give tasks more time, consider increasing "
+ "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or "
+ "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE.",
+ get_gather_timeout_seconds(),
+ pending,
+ )
+
+ else:
+ logger.warning(
+ "gather TIMEOUT after %.1fs. Some metrics may be marked as timed out. "
+ "To give tasks more time, consider increasing "
+ "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or "
+ "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE.",
+ get_gather_timeout_seconds(),
+ )


  def _skip_metrics_for_error(
@@ -234,18 +272,6 @@ async def _snapshot_tasks():
  return {t for t in asyncio.all_tasks() if t is not cur}


- def _per_task_timeout() -> float:
- return get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-
-
- def _gather_timeout() -> float:
- s = get_settings()
- return (
- s.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
- + s.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
- )
-
-
  def filter_duplicate_results(
  main_result: TestResult, results: List[TestResult]
  ) -> List[TestResult]:
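
The module-local _per_task_timeout() and _gather_timeout() helpers removed above are superseded by get_per_task_timeout_seconds(), get_gather_timeout_seconds(), get_gather_timeout() and are_timeouts_disabled(), imported from deepeval.utils in the first hunk of this file. Their implementations are not shown in this diff; the sketch below is only a reading of what the call sites appear to expect, reusing the settings fields from the removed code and assuming that get_gather_timeout() returns None when timeouts are disabled (so asyncio.wait_for with timeout=None imposes no deadline). The import path for get_settings is also an assumption.

    from typing import Optional

    from deepeval.config.settings import get_settings  # import path assumed


    def are_timeouts_disabled() -> bool:
        # Assumed to mirror the DEEPEVAL_DISABLE_TIMEOUTS flag used elsewhere in this diff.
        return bool(get_settings().DEEPEVAL_DISABLE_TIMEOUTS)


    def get_per_task_timeout_seconds() -> float:
        # Same settings field the removed _per_task_timeout() read.
        return get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS


    def get_gather_timeout_seconds() -> float:
        # Same formula as the removed _gather_timeout(): per-task budget plus a gather buffer.
        s = get_settings()
        return (
            s.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
            + s.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
        )


    def get_gather_timeout() -> Optional[float]:
        # Assumption: None when timeouts are disabled, so asyncio.wait_for waits indefinitely.
        if are_timeouts_disabled():
            return None
        return get_gather_timeout_seconds()
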
@@ -267,6 +293,10 @@ async def _await_with_outer_deadline(obj, *args, timeout: float, **kwargs):
  coro = obj
  else:
  coro = obj(*args, **kwargs)
+
+ if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+ return await coro
+
  return await asyncio.wait_for(coro, timeout=timeout)
  finally:
  reset_outer_deadline(token)
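
The guard added above means that when DEEPEVAL_DISABLE_TIMEOUTS is set, the coroutine is awaited directly and asyncio.wait_for is never entered, so no timeout can originate from this wrapper. A small, self-contained illustration of that pattern follows; the helper name is invented for the example and is not deepeval's function.

    import asyncio


    async def await_with_optional_deadline(coro, timeout: float, disabled: bool):
        # With the deadline disabled, await directly: this layer can no longer
        # raise asyncio.TimeoutError; only the awaited code itself can fail.
        if disabled:
            return await coro
        return await asyncio.wait_for(coro, timeout=timeout)


    async def main():
        async def slow():
            await asyncio.sleep(0.2)
            return "done"

        # Runtime (0.2s) exceeds the 0.05s deadline, but disabled=True lets it finish.
        print(await await_with_optional_deadline(slow(), timeout=0.05, disabled=True))


    asyncio.run(main())
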
@@ -282,7 +312,6 @@ def execute_test_cases(
  metrics: Union[
  List[BaseMetric],
  List[BaseConversationalMetric],
- List[BaseMultimodalMetric],
  ],
  error_config: Optional[ErrorConfig] = ErrorConfig(),
  display_config: Optional[DisplayConfig] = DisplayConfig(),
@@ -315,17 +344,12 @@ def execute_test_cases(

  conversational_metrics: List[BaseConversationalMetric] = []
  llm_metrics: List[BaseMetric] = []
- mllm_metrics: List[BaseMultimodalMetric] = []
  for metric in metrics:
  metric.async_mode = False
  if isinstance(metric, BaseMetric):
  llm_metrics.append(metric)
- if type(metric) in MLLM_SUPPORTED_METRICS:
- mllm_metrics.append(metric)
  elif isinstance(metric, BaseConversationalMetric):
  conversational_metrics.append(metric)
- elif isinstance(metric, BaseMultimodalMetric):
- mllm_metrics.append(metric)

  test_results: List[TestResult] = []

@@ -333,23 +357,17 @@ def execute_test_cases(
  progress: Optional[Progress] = None, pbar_id: Optional[int] = None
  ):
  llm_test_case_count = -1
- mllm_test_case_count = -1
  conversational_test_case_count = -1
  show_metric_indicator = (
  display_config.show_indicator and not _use_bar_indicator
  )
  for i, test_case in enumerate(test_cases):
  # skip what we know we won't run
- if isinstance(test_case, LLMTestCase) and not test_case.multimodal:
+ if isinstance(test_case, LLMTestCase):
  if not llm_metrics:
  update_pbar(progress, pbar_id)
  continue
  per_case_total = len(llm_metrics)
- elif isinstance(test_case, LLMTestCase) and test_case.multimodal:
- if not mllm_metrics:
- update_pbar(progress, pbar_id)
- continue
- per_case_total = len(mllm_metrics)
  elif isinstance(test_case, ConversationalTestCase):
  if not conversational_metrics:
  update_pbar(progress, pbar_id)
@@ -364,56 +382,33 @@ def execute_test_cases(

  metrics_for_case = (
  llm_metrics
- if (
- isinstance(test_case, LLMTestCase)
- and not test_case.multimodal
- )
- else (
- mllm_metrics
- if (
- isinstance(test_case, LLMTestCase)
- and test_case.multimodal
- )
- else conversational_metrics
- )
+ if (isinstance(test_case, LLMTestCase))
+ else conversational_metrics
  )
  api_test_case = create_api_test_case(
  test_case=test_case,
  index=(
  llm_test_case_count + 1
- if (
- isinstance(test_case, LLMTestCase)
- and not test_case.multimodal
- )
- else (
- mllm_test_case_count + 1
- if (
- isinstance(test_case, LLMTestCase)
- and test_case.multimodal
- )
- else conversational_test_case_count + 1
- )
+ if (isinstance(test_case, LLMTestCase))
+ else (conversational_test_case_count + 1)
  ),
  )
  emitted = [False] * len(metrics_for_case)
  index_of = {id(m): i for i, m in enumerate(metrics_for_case)}
  current_index = -1
  start_time = time.perf_counter()
- deadline_timeout = _per_task_timeout()
+ deadline_timeout = get_per_task_timeout_seconds()
  deadline_token = set_outer_deadline(deadline_timeout)
  new_cached_test_case: CachedTestCase = None
  try:

  def _run_case():
- nonlocal new_cached_test_case, current_index, llm_test_case_count, mllm_test_case_count, conversational_test_case_count
+ nonlocal new_cached_test_case, current_index, llm_test_case_count, conversational_test_case_count
  with capture_evaluation_run("test case"):
  for metric in metrics:
  metric.error = None # Reset metric error

- if (
- isinstance(test_case, LLMTestCase)
- and not test_case.multimodal
- ):
+ if isinstance(test_case, LLMTestCase):
  llm_test_case_count += 1
  cached_test_case = None
  if cache_config.use_cache:
@@ -465,29 +460,6 @@ def execute_test_cases(
  )
  update_pbar(progress, pbar_test_case_id)

- # No caching and not sending test cases to Confident AI for multimodal metrics yet
- elif (
- isinstance(test_case, LLMTestCase)
- and test_case.multimodal
- ):
- mllm_test_case_count += 1
- for metric in mllm_metrics:
- current_index = index_of[id(metric)]
- res = _execute_metric(
- metric=metric,
- test_case=test_case,
- show_metric_indicator=show_metric_indicator,
- in_component=False,
- error_config=error_config,
- )
- if res == "skip":
- continue
-
- metric_data = create_metric_data(metric)
- api_test_case.update_metric_data(metric_data)
- emitted[current_index] = True
- update_pbar(progress, pbar_test_case_id)
-
  # No caching for conversational metrics yet
  elif isinstance(test_case, ConversationalTestCase):
  conversational_test_case_count += 1
@@ -510,25 +482,20 @@ def execute_test_cases(

  run_sync_with_timeout(_run_case, deadline_timeout)
  except (asyncio.TimeoutError, TimeoutError):
- msg = (
- f"Timed out after {deadline_timeout:.2f}s while evaluating metric. "
- "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
- "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
- )
- for i, m in enumerate(metrics_for_case):
- if getattr(m, "skipped", False):
+
+ msg = _timeout_msg("evaluating metric", deadline_timeout)
+ for i, metric in enumerate(metrics_for_case):
+ if metric.skipped:
  continue
  # already finished or errored? leave it
- if getattr(m, "success", None) is not None or getattr(
- m, "error", None
- ):
+ if metric.success is not None or metric.error is not None:
  continue
  if i == current_index:
- m.success = False
- m.error = msg
+ metric.success = False
+ metric.error = msg
  elif i > current_index:
- m.success = False
- m.error = "Skipped due to case timeout."
+ metric.success = False
+ metric.error = "Skipped due to case timeout."

  if not error_config.ignore_errors:
  raise
@@ -553,12 +520,12 @@ def execute_test_cases(
  )

  # Attach MetricData for *all* metrics (finished or synthesized)
- for i, m in enumerate(metrics_for_case):
- if getattr(m, "skipped", False):
+ for i, metric in enumerate(metrics_for_case):
+ if metric.skipped:
  continue
  if not emitted[i]:
  api_test_case.update_metric_data(
- create_metric_data(m)
+ create_metric_data(metric)
  )

  elapsed = time.perf_counter() - start_time
@@ -597,7 +564,6 @@ async def a_execute_test_cases(
  metrics: Union[
  List[BaseMetric],
  List[BaseConversationalMetric],
- List[BaseMultimodalMetric],
  ],
  error_config: Optional[ErrorConfig] = ErrorConfig(),
  display_config: Optional[DisplayConfig] = DisplayConfig(),
@@ -612,9 +578,8 @@ async def a_execute_test_cases(

  async def execute_with_semaphore(func: Callable, *args, **kwargs):
  async with semaphore:
- timeout = _per_task_timeout()
  return await _await_with_outer_deadline(
- func, *args, timeout=timeout, **kwargs
+ func, *args, timeout=get_per_task_timeout_seconds(), **kwargs
  )

  global_test_run_cache_manager.disable_write_cache = (
@@ -631,20 +596,14 @@ async def a_execute_test_cases(
  metric.verbose_mode = display_config.verbose_mode

  llm_metrics: List[BaseMetric] = []
- mllm_metrics: List[BaseMultimodalMetric] = []
  conversational_metrics: List[BaseConversationalMetric] = []
  for metric in metrics:
  if isinstance(metric, BaseMetric):
  llm_metrics.append(metric)
- if type(metric) in MLLM_SUPPORTED_METRICS:
- mllm_metrics.append(metric)
- elif isinstance(metric, BaseMultimodalMetric):
- mllm_metrics.append(metric)
  elif isinstance(metric, BaseConversationalMetric):
  conversational_metrics.append(metric)

  llm_test_case_counter = -1
- mllm_test_case_counter = -1
  conversational_test_case_counter = -1
  test_results: List[Union[TestResult, LLMTestCase]] = []
  tasks = []
@@ -665,10 +624,7 @@ async def a_execute_test_cases(
  with progress:
  for test_case in test_cases:
  with capture_evaluation_run("test case"):
- if (
- isinstance(test_case, LLMTestCase)
- and not test_case.multimodal
- ):
+ if isinstance(test_case, LLMTestCase):
  if len(llm_metrics) == 0:
  update_pbar(progress, pbar_id)
  continue
@@ -696,31 +652,6 @@ async def a_execute_test_cases(
  )
  tasks.append(asyncio.create_task(task))

- elif (
- isinstance(test_case, LLMTestCase)
- and test_case.multimodal
- ):
- mllm_test_case_counter += 1
- copied_multimodal_metrics: List[
- BaseMultimodalMetric
- ] = copy_metrics(mllm_metrics)
- task = execute_with_semaphore(
- func=_a_execute_mllm_test_cases,
- metrics=copied_multimodal_metrics,
- test_case=test_case,
- test_run_manager=test_run_manager,
- test_results=test_results,
- count=mllm_test_case_counter,
- ignore_errors=error_config.ignore_errors,
- skip_on_missing_params=error_config.skip_on_missing_params,
- show_indicator=display_config.show_indicator,
- _use_bar_indicator=_use_bar_indicator,
- _is_assert_test=_is_assert_test,
- progress=progress,
- pbar_id=pbar_id,
- )
- tasks.append(asyncio.create_task(task))
-
  elif isinstance(test_case, ConversationalTestCase):
  conversational_test_case_counter += 1

@@ -746,27 +677,23 @@ async def a_execute_test_cases(
  try:
  await asyncio.wait_for(
  asyncio.gather(*tasks),
- timeout=_gather_timeout(),
+ timeout=get_gather_timeout(),
  )
- except (asyncio.TimeoutError, TimeoutError):
+ except (asyncio.TimeoutError, TimeoutError) as e:
  for t in tasks:
  if not t.done():
  t.cancel()
  await asyncio.gather(*tasks, return_exceptions=True)
- logging.getLogger("deepeval").error(
- "Gather timed out after %.1fs. Some metrics may be marked as timed out.",
- _gather_timeout(),
- )
+
+ _log_gather_timeout(logger, exc=e)
+
  if not error_config.ignore_errors:
  raise

  else:
  for test_case in test_cases:
  with capture_evaluation_run("test case"):
- if (
- isinstance(test_case, LLMTestCase)
- and not test_case.multimodal
- ):
+ if isinstance(test_case, LLMTestCase):
  if len(llm_metrics) == 0:
  continue
  llm_test_case_counter += 1
@@ -814,34 +741,12 @@ async def a_execute_test_cases(
  )
  tasks.append(asyncio.create_task((task)))

- elif (
- isinstance(test_case, LLMTestCase) and test_case.multimodal
- ):
- mllm_test_case_counter += 1
- copied_multimodal_metrics: List[BaseMultimodalMetric] = (
- copy_metrics(mllm_metrics)
- )
- task = execute_with_semaphore(
- func=_a_execute_mllm_test_cases,
- metrics=copied_multimodal_metrics,
- test_case=test_case,
- test_run_manager=test_run_manager,
- test_results=test_results,
- count=mllm_test_case_counter,
- ignore_errors=error_config.ignore_errors,
- skip_on_missing_params=error_config.skip_on_missing_params,
- _use_bar_indicator=_use_bar_indicator,
- _is_assert_test=_is_assert_test,
- show_indicator=display_config.show_indicator,
- )
- tasks.append(asyncio.create_task(task))
-
  await asyncio.sleep(async_config.throttle_value)

  try:
  await asyncio.wait_for(
  asyncio.gather(*tasks),
- timeout=_gather_timeout(),
+ timeout=get_gather_timeout(),
  )
  except (asyncio.TimeoutError, TimeoutError):
  # Cancel any still-pending tasks and drain them
@@ -910,11 +815,18 @@ async def _a_execute_llm_test_cases(
  progress=progress,
  )
  except asyncio.CancelledError:
- msg = (
- "Timed out/cancelled while evaluating metric. "
- "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
- "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
- )
+ if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+ msg = (
+ "Cancelled while evaluating metric. "
+ "(DeepEval timeouts are disabled; this cancellation likely came from upstream orchestration or manual cancellation). "
+ "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+ )
+ else:
+ msg = (
+ "Timed out/cancelled while evaluating metric. "
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+ )
  for m in metrics:
  if getattr(m, "skipped", False):
  continue
@@ -976,85 +888,8 @@ async def _a_execute_llm_test_cases(
  update_pbar(progress, pbar_id)


- async def _a_execute_mllm_test_cases(
- metrics: List[BaseMultimodalMetric],
- test_case: LLMTestCase,
- test_run_manager: TestRunManager,
- test_results: List[Union[TestResult, LLMTestCase]],
- count: int,
- ignore_errors: bool,
- skip_on_missing_params: bool,
- show_indicator: bool,
- _use_bar_indicator: bool,
- _is_assert_test: bool,
- progress: Optional[Progress] = None,
- pbar_id: Optional[int] = None,
- ):
- show_metrics_indicator = show_indicator and not _use_bar_indicator
- pbar_test_case_id = add_pbar(
- progress,
- f" 🎯 Evaluating test case #{count}",
- total=len(metrics),
- )
-
- for metric in metrics:
- metric.skipped = False
- metric.error = None # Reset metric error
-
- api_test_case: LLMApiTestCase = create_api_test_case(
- test_case=test_case, index=count if not _is_assert_test else None
- )
- test_start_time = time.perf_counter()
- try:
- await measure_metrics_with_indicator(
- metrics=metrics,
- test_case=test_case,
- cached_test_case=None,
- skip_on_missing_params=skip_on_missing_params,
- ignore_errors=ignore_errors,
- show_indicator=show_metrics_indicator,
- pbar_eval_id=pbar_test_case_id,
- progress=progress,
- )
- except asyncio.CancelledError:
- msg = (
- "Timed out/cancelled while evaluating metric. "
- "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
- "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
- )
- for m in metrics:
- if getattr(m, "skipped", False):
- continue
- # If the task never finished and didn't set a terminal state, mark it now
- if getattr(m, "success", None) is None and not getattr(
- m, "error", None
- ):
- m.success = False
- m.error = msg
- if not ignore_errors:
- raise
- finally:
- for metric in metrics:
- if metric.skipped:
- continue
-
- metric_data = create_metric_data(metric)
- api_test_case.update_metric_data(metric_data)
-
- test_end_time = time.perf_counter()
- run_duration = test_end_time - test_start_time
- api_test_case.update_run_duration(run_duration)
-
- ### Update Test Run ###
- test_run_manager.update_test_run(api_test_case, test_case)
- test_results.append(create_test_result(api_test_case))
- update_pbar(progress, pbar_id)
-
-
  async def _a_execute_conversational_test_cases(
- metrics: List[
- Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric]
- ],
+ metrics: List[Union[BaseMetric, BaseConversationalMetric]],
  test_case: ConversationalTestCase,
  test_run_manager: TestRunManager,
  test_results: List[Union[TestResult, LLMTestCase]],
@@ -1097,11 +932,18 @@ async def _a_execute_conversational_test_cases(
  )

  except asyncio.CancelledError:
- msg = (
- "Timed out/cancelled while evaluating metric. "
- "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
- "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
- )
+ if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+ msg = (
+ "Cancelled while evaluating metric. "
+ "(DeepEval timeouts are disabled; this cancellation likely came from upstream orchestration or manual cancellation). "
+ "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+ )
+ else:
+ msg = (
+ "Timed out/cancelled while evaluating metric. "
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+ )
  for m in metrics:
  if getattr(m, "skipped", False):
  continue
@@ -1211,7 +1053,7 @@ def execute_agentic_test_cases(
  loop.run_until_complete(
  _await_with_outer_deadline(
  coro,
- timeout=_per_task_timeout(),
+ timeout=get_per_task_timeout_seconds(),
  )
  )
  else:
@@ -1538,17 +1380,13 @@ def execute_agentic_test_cases(

  # run the golden with a timeout
  start_time = time.perf_counter()
- deadline = _per_task_timeout()
+ deadline = get_per_task_timeout_seconds()

  try:
  run_sync_with_timeout(_run_golden, deadline)
  except (asyncio.TimeoutError, TimeoutError):
  # mark any not yet finished trace level and span level metrics as timed out.
- msg = (
- f"Timed out after {deadline:.2f}s while executing agentic test case. "
- "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
- "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
- )
+ msg = _timeout_msg("executing agentic test case", deadline)

  if current_trace is not None:
  # Trace-level metrics
@@ -1729,9 +1567,8 @@ async def a_execute_agentic_test_cases(

  async def execute_with_semaphore(func: Callable, *args, **kwargs):
  async with semaphore:
- timeout = _per_task_timeout()
  return await _await_with_outer_deadline(
- func, *args, timeout=timeout, **kwargs
+ func, *args, timeout=get_per_task_timeout_seconds(), **kwargs
  )

  test_run_manager = global_test_run_manager
@@ -1782,7 +1619,7 @@ async def a_execute_agentic_test_cases(
  try:
  await asyncio.wait_for(
  asyncio.gather(*tasks),
- timeout=_gather_timeout(),
+ timeout=get_gather_timeout(),
  )
  except (asyncio.TimeoutError, TimeoutError):
  # Cancel any still-pending tasks and drain them
@@ -1863,7 +1700,7 @@ async def _a_execute_agentic_test_case(
  await _await_with_outer_deadline(
  observed_callback,
  golden.input,
- timeout=_per_task_timeout(),
+ timeout=get_per_task_timeout_seconds(),
  )
  else:
  observed_callback(golden.input)
@@ -1957,7 +1794,7 @@ async def _a_execute_agentic_test_case(
  try:
  await asyncio.wait_for(
  asyncio.gather(*child_tasks),
- timeout=_gather_timeout(),
+ timeout=get_gather_timeout(),
  )
  except (asyncio.TimeoutError, TimeoutError):
  for t in child_tasks:
@@ -1980,11 +1817,18 @@ async def _a_execute_agentic_test_case(
  )
  except asyncio.CancelledError:
  # mark any unfinished metrics as cancelled
- cancel_msg = (
- "Timed out/cancelled while evaluating agentic test case. "
- "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
- "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
- )
+ if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+ cancel_msg = (
+ "Cancelled while evaluating agentic test case. "
+ "(DeepEval timeouts are disabled; this cancellation likely came from upstream orchestration or manual cancellation). "
+ "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+ )
+ else:
+ cancel_msg = (
+ "Timed out/cancelled while evaluating agentic test case. "
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+ )

  if trace_metrics:
  for m in trace_metrics:
@@ -2676,8 +2520,9 @@ def a_execute_agentic_test_cases_from_loop(

  async def execute_callback_with_semaphore(coroutine: Awaitable):
  async with semaphore:
- timeout = _per_task_timeout()
- return await _await_with_outer_deadline(coroutine, timeout=timeout)
+ return await _await_with_outer_deadline(
+ coroutine, timeout=get_per_task_timeout_seconds()
+ )

  def evaluate_test_cases(
  progress: Optional[Progress] = None,
@@ -2899,15 +2744,18 @@ def a_execute_agentic_test_cases_from_loop(
  loop.run_until_complete(
  asyncio.wait_for(
  asyncio.gather(*created_tasks, return_exceptions=True),
- timeout=_gather_timeout(),
+ timeout=get_gather_timeout(),
  )
  )

- except (asyncio.TimeoutError, TimeoutError):
+ except (asyncio.TimeoutError, TimeoutError) as e:
  import traceback

+ settings = get_settings()
  pending = [t for t in created_tasks if not t.done()]

+ _log_gather_timeout(logger, exc=e, pending=len(pending))
+
  # Log the elapsed time for each task that was pending
  for t in pending:
  meta = task_meta.get(t, {})
@@ -2915,26 +2763,27 @@ def a_execute_agentic_test_cases_from_loop(
  elapsed_time = time.perf_counter() - start_time

  # Determine if it was a per task or gather timeout based on task's elapsed time
- if elapsed_time >= _per_task_timeout():
- timeout_type = "per-task"
+ if not settings.DEEPEVAL_DISABLE_TIMEOUTS:
+ timeout_type = (
+ "per-task"
+ if elapsed_time >= get_per_task_timeout_seconds()
+ else "gather"
+ )
+ logger.info(
+ " - PENDING %s elapsed_time=%.2fs timeout_type=%s meta=%s",
+ t.get_name(),
+ elapsed_time,
+ timeout_type,
+ meta,
+ )
  else:
- timeout_type = "gather"
-
- logger.warning(
- f"[deepeval] gather TIMEOUT after {_gather_timeout()}s; "
- f"pending={len(pending)} tasks. Timeout type: {timeout_type}. "
- f"To give tasks more time, consider increasing "
- f"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS for longer task completion time or "
- f"DEEPEVAL_TASK_GATHER_BUFFER_SECONDS to allow more time for gathering results."
- )
+ logger.info(
+ " - PENDING %s elapsed_time=%.2fs meta=%s",
+ t.get_name(),
+ elapsed_time,
+ meta,
+ )

- # Log pending tasks and their stack traces
- logger.info(
- " - PENDING %s elapsed_time=%.2fs meta=%s",
- t.get_name(),
- elapsed_time,
- meta,
- )
  if loop.get_debug() and get_settings().DEEPEVAL_DEBUG_ASYNC:
  frames = t.get_stack(limit=6)
  if frames:
@@ -3116,9 +2965,8 @@ async def _a_evaluate_traces(

  async def execute_evals_with_semaphore(func: Callable, *args, **kwargs):
  async with semaphore:
- timeout = _per_task_timeout()
  return await _await_with_outer_deadline(
- func, *args, timeout=timeout, **kwargs
+ func, *args, timeout=get_per_task_timeout_seconds(), **kwargs
  )

  eval_tasks = []
@@ -3166,7 +3014,7 @@ async def _a_evaluate_traces(
  try:
  await asyncio.wait_for(
  asyncio.gather(*eval_tasks),
- timeout=_gather_timeout(),
+ timeout=get_gather_timeout(),
  )
  except (asyncio.TimeoutError, TimeoutError):
  for t in eval_tasks:
@@ -3196,9 +3044,8 @@ async def _evaluate_test_case_pairs(

  async def execute_with_semaphore(func: Callable, *args, **kwargs):
  async with semaphore:
- timeout = _per_task_timeout()
  return await _await_with_outer_deadline(
- func, *args, timeout=timeout, **kwargs
+ func, *args, timeout=get_per_task_timeout_seconds(), **kwargs
  )

  tasks = []
@@ -3236,7 +3083,7 @@ async def _evaluate_test_case_pairs(
  try:
  await asyncio.wait_for(
  asyncio.gather(*tasks),
- timeout=_gather_timeout(),
+ timeout=get_gather_timeout(),
  )
  except (asyncio.TimeoutError, TimeoutError):
  # Cancel any still-pending tasks and drain them
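
Taken together, these hunks route every evaluation timeout through the new shared helpers and allow the whole mechanism to be switched off via DEEPEVAL_DISABLE_TIMEOUTS. The override names that appear in the new messages (DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE, DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE, DEEPEVAL_LOG_STACK_TRACES) suggest environment-level configuration, but whether they are read as environment variables is an assumption about deepeval's settings layer that this diff does not confirm. A hedged usage sketch:

    import os

    # Assumption: these names are honoured as environment variables by deepeval's
    # settings layer; only the names themselves come from the messages in this diff.
    os.environ["DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE"] = "600"    # larger per-task budget
    os.environ["DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE"] = "120"  # extra time for gathering results
    os.environ["DEEPEVAL_LOG_STACK_TRACES"] = "1"                       # full tracebacks on timeout
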