deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/execute.py
CHANGED
@@ -51,20 +51,16 @@ from deepeval.utils import (
     shorten,
     len_medium,
     format_error_text,
+    are_timeouts_disabled,
+    get_per_task_timeout_seconds,
+    get_gather_timeout_seconds,
+    get_gather_timeout,
 )
 from deepeval.telemetry import capture_evaluation_run
 from deepeval.metrics import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
     TaskCompletionMetric,
-    # RAG metrics that support both single-turn and multimodal
-    ContextualPrecisionMetric,
-    ContextualRecallMetric,
-    ContextualRelevancyMetric,
-    AnswerRelevancyMetric,
-    FaithfulnessMetric,
-    ToolCorrectnessMetric,
 )
 from deepeval.metrics.indicator import (
     measure_metrics_with_indicator,
@@ -116,14 +112,56 @@ from deepeval.test_run.hyperparameters import (
 
 logger = logging.getLogger(__name__)
 
-
-
-
-
-
-
-
-
+
+def _timeout_msg(action: str, seconds: float) -> str:
+    if are_timeouts_disabled():
+        return (
+            f"Timeout occurred while {action} "
+            "(DeepEval timeouts are disabled; this likely came from the model/provider SDK or network layer). "
+            "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+        )
+    return (
+        f"Timed out after {seconds:.2f}s while {action}. "
+        "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+        "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+    )
+
+
+def _log_gather_timeout(
+    logger,
+    *,
+    exc: Optional[BaseException] = None,
+    pending: Optional[int] = None,
+) -> None:
+    settings = get_settings()
+    if are_timeouts_disabled():
+        logger.warning(
+            "A task raised %s while waiting for gathered results; DeepEval gather/per-task timeouts are disabled%s. "
+            "This likely came from the model/provider SDK or network layer.",
+            type(exc).__name__ if exc else "TimeoutError",
+            f" (pending={pending})" if pending is not None else "",
+            exc_info=settings.DEEPEVAL_LOG_STACK_TRACES,
+        )
+    else:
+        if pending is not None:
+            logger.warning(
+                "Gather TIMEOUT after %.1fs; pending=%d tasks. "
+                "Some metrics may be marked as timed out. "
+                "To give tasks more time, consider increasing "
+                "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or "
+                "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE.",
+                get_gather_timeout_seconds(),
+                pending,
+            )
+
+        else:
+            logger.warning(
+                "gather TIMEOUT after %.1fs. Some metrics may be marked as timed out. "
+                "To give tasks more time, consider increasing "
+                "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or "
+                "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE.",
+                get_gather_timeout_seconds(),
+            )
 
 
 def _skip_metrics_for_error(
@@ -234,18 +272,6 @@ async def _snapshot_tasks():
     return {t for t in asyncio.all_tasks() if t is not cur}
 
 
-def _per_task_timeout() -> float:
-    return get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-
-
-def _gather_timeout() -> float:
-    s = get_settings()
-    return (
-        s.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-        + s.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
-    )
-
-
 def filter_duplicate_results(
     main_result: TestResult, results: List[TestResult]
 ) -> List[TestResult]:
@@ -267,6 +293,10 @@ async def _await_with_outer_deadline(obj, *args, timeout: float, **kwargs):
             coro = obj
         else:
             coro = obj(*args, **kwargs)
+
+        if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+            return await coro
+
         return await asyncio.wait_for(coro, timeout=timeout)
     finally:
         reset_outer_deadline(token)
@@ -282,7 +312,6 @@ def execute_test_cases(
     metrics: Union[
         List[BaseMetric],
         List[BaseConversationalMetric],
-        List[BaseMultimodalMetric],
     ],
     error_config: Optional[ErrorConfig] = ErrorConfig(),
     display_config: Optional[DisplayConfig] = DisplayConfig(),
@@ -315,17 +344,12 @@ def execute_test_cases(
 
     conversational_metrics: List[BaseConversationalMetric] = []
     llm_metrics: List[BaseMetric] = []
-    mllm_metrics: List[BaseMultimodalMetric] = []
     for metric in metrics:
         metric.async_mode = False
         if isinstance(metric, BaseMetric):
             llm_metrics.append(metric)
-            if type(metric) in MLLM_SUPPORTED_METRICS:
-                mllm_metrics.append(metric)
         elif isinstance(metric, BaseConversationalMetric):
             conversational_metrics.append(metric)
-        elif isinstance(metric, BaseMultimodalMetric):
-            mllm_metrics.append(metric)
 
     test_results: List[TestResult] = []
 
@@ -333,23 +357,17 @@ def execute_test_cases(
         progress: Optional[Progress] = None, pbar_id: Optional[int] = None
     ):
         llm_test_case_count = -1
-        mllm_test_case_count = -1
         conversational_test_case_count = -1
         show_metric_indicator = (
            display_config.show_indicator and not _use_bar_indicator
         )
         for i, test_case in enumerate(test_cases):
             # skip what we know we won't run
-            if isinstance(test_case, LLMTestCase)
+            if isinstance(test_case, LLMTestCase):
                 if not llm_metrics:
                     update_pbar(progress, pbar_id)
                     continue
                 per_case_total = len(llm_metrics)
-            elif isinstance(test_case, LLMTestCase) and test_case.multimodal:
-                if not mllm_metrics:
-                    update_pbar(progress, pbar_id)
-                    continue
-                per_case_total = len(mllm_metrics)
             elif isinstance(test_case, ConversationalTestCase):
                 if not conversational_metrics:
                     update_pbar(progress, pbar_id)
@@ -364,56 +382,33 @@ def execute_test_cases(
 
             metrics_for_case = (
                 llm_metrics
-                if (
-
-                    and not test_case.multimodal
-                )
-                else (
-                    mllm_metrics
-                    if (
-                        isinstance(test_case, LLMTestCase)
-                        and test_case.multimodal
-                    )
-                    else conversational_metrics
-                )
+                if (isinstance(test_case, LLMTestCase))
+                else conversational_metrics
             )
             api_test_case = create_api_test_case(
                 test_case=test_case,
                 index=(
                     llm_test_case_count + 1
-                    if (
-
-                        and not test_case.multimodal
-                    )
-                    else (
-                        mllm_test_case_count + 1
-                        if (
-                            isinstance(test_case, LLMTestCase)
-                            and test_case.multimodal
-                        )
-                        else conversational_test_case_count + 1
-                    )
+                    if (isinstance(test_case, LLMTestCase))
+                    else (conversational_test_case_count + 1)
                 ),
             )
             emitted = [False] * len(metrics_for_case)
             index_of = {id(m): i for i, m in enumerate(metrics_for_case)}
             current_index = -1
             start_time = time.perf_counter()
-            deadline_timeout =
+            deadline_timeout = get_per_task_timeout_seconds()
             deadline_token = set_outer_deadline(deadline_timeout)
             new_cached_test_case: CachedTestCase = None
             try:
 
                 def _run_case():
-                    nonlocal new_cached_test_case, current_index, llm_test_case_count,
+                    nonlocal new_cached_test_case, current_index, llm_test_case_count, conversational_test_case_count
                     with capture_evaluation_run("test case"):
                         for metric in metrics:
                             metric.error = None  # Reset metric error
 
-                        if (
-                            isinstance(test_case, LLMTestCase)
-                            and not test_case.multimodal
-                        ):
+                        if isinstance(test_case, LLMTestCase):
                             llm_test_case_count += 1
                             cached_test_case = None
                             if cache_config.use_cache:
@@ -465,29 +460,6 @@ def execute_test_cases(
                                 )
                                 update_pbar(progress, pbar_test_case_id)
 
-                        # No caching and not sending test cases to Confident AI for multimodal metrics yet
-                        elif (
-                            isinstance(test_case, LLMTestCase)
-                            and test_case.multimodal
-                        ):
-                            mllm_test_case_count += 1
-                            for metric in mllm_metrics:
-                                current_index = index_of[id(metric)]
-                                res = _execute_metric(
-                                    metric=metric,
-                                    test_case=test_case,
-                                    show_metric_indicator=show_metric_indicator,
-                                    in_component=False,
-                                    error_config=error_config,
-                                )
-                                if res == "skip":
-                                    continue
-
-                                metric_data = create_metric_data(metric)
-                                api_test_case.update_metric_data(metric_data)
-                                emitted[current_index] = True
-                                update_pbar(progress, pbar_test_case_id)
-
                         # No caching for conversational metrics yet
                         elif isinstance(test_case, ConversationalTestCase):
                             conversational_test_case_count += 1
@@ -510,25 +482,20 @@ def execute_test_cases(
 
                 run_sync_with_timeout(_run_case, deadline_timeout)
             except (asyncio.TimeoutError, TimeoutError):
-
-
-
-
-                )
-                for i, m in enumerate(metrics_for_case):
-                    if getattr(m, "skipped", False):
+
+                msg = _timeout_msg("evaluating metric", deadline_timeout)
+                for i, metric in enumerate(metrics_for_case):
+                    if metric.skipped:
                         continue
                     # already finished or errored? leave it
-                    if
-                        m, "error", None
-                    ):
+                    if metric.success is not None or metric.error is not None:
                         continue
                     if i == current_index:
-
-
+                        metric.success = False
+                        metric.error = msg
                     elif i > current_index:
-
-
+                        metric.success = False
+                        metric.error = "Skipped due to case timeout."
 
                 if not error_config.ignore_errors:
                     raise
@@ -553,12 +520,12 @@ def execute_test_cases(
                 )
 
             # Attach MetricData for *all* metrics (finished or synthesized)
-            for i,
-                if
+            for i, metric in enumerate(metrics_for_case):
+                if metric.skipped:
                     continue
                 if not emitted[i]:
                     api_test_case.update_metric_data(
-                        create_metric_data(
+                        create_metric_data(metric)
                     )
 
             elapsed = time.perf_counter() - start_time
@@ -597,7 +564,6 @@ async def a_execute_test_cases(
     metrics: Union[
         List[BaseMetric],
         List[BaseConversationalMetric],
-        List[BaseMultimodalMetric],
     ],
     error_config: Optional[ErrorConfig] = ErrorConfig(),
    display_config: Optional[DisplayConfig] = DisplayConfig(),
@@ -612,9 +578,8 @@ async def a_execute_test_cases(
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-            timeout = _per_task_timeout()
             return await _await_with_outer_deadline(
-                func, *args, timeout=
+                func, *args, timeout=get_per_task_timeout_seconds(), **kwargs
             )
 
     global_test_run_cache_manager.disable_write_cache = (
@@ -631,20 +596,14 @@ async def a_execute_test_cases(
             metric.verbose_mode = display_config.verbose_mode
 
     llm_metrics: List[BaseMetric] = []
-    mllm_metrics: List[BaseMultimodalMetric] = []
    conversational_metrics: List[BaseConversationalMetric] = []
    for metric in metrics:
        if isinstance(metric, BaseMetric):
            llm_metrics.append(metric)
-            if type(metric) in MLLM_SUPPORTED_METRICS:
-                mllm_metrics.append(metric)
-        elif isinstance(metric, BaseMultimodalMetric):
-            mllm_metrics.append(metric)
        elif isinstance(metric, BaseConversationalMetric):
            conversational_metrics.append(metric)
 
    llm_test_case_counter = -1
-    mllm_test_case_counter = -1
    conversational_test_case_counter = -1
    test_results: List[Union[TestResult, LLMTestCase]] = []
    tasks = []
@@ -665,10 +624,7 @@ async def a_execute_test_cases(
         with progress:
             for test_case in test_cases:
                 with capture_evaluation_run("test case"):
-                    if (
-                        isinstance(test_case, LLMTestCase)
-                        and not test_case.multimodal
-                    ):
+                    if isinstance(test_case, LLMTestCase):
                         if len(llm_metrics) == 0:
                             update_pbar(progress, pbar_id)
                             continue
@@ -696,31 +652,6 @@ async def a_execute_test_cases(
                         )
                         tasks.append(asyncio.create_task(task))
 
-                    elif (
-                        isinstance(test_case, LLMTestCase)
-                        and test_case.multimodal
-                    ):
-                        mllm_test_case_counter += 1
-                        copied_multimodal_metrics: List[
-                            BaseMultimodalMetric
-                        ] = copy_metrics(mllm_metrics)
-                        task = execute_with_semaphore(
-                            func=_a_execute_mllm_test_cases,
-                            metrics=copied_multimodal_metrics,
-                            test_case=test_case,
-                            test_run_manager=test_run_manager,
-                            test_results=test_results,
-                            count=mllm_test_case_counter,
-                            ignore_errors=error_config.ignore_errors,
-                            skip_on_missing_params=error_config.skip_on_missing_params,
-                            show_indicator=display_config.show_indicator,
-                            _use_bar_indicator=_use_bar_indicator,
-                            _is_assert_test=_is_assert_test,
-                            progress=progress,
-                            pbar_id=pbar_id,
-                        )
-                        tasks.append(asyncio.create_task(task))
-
                     elif isinstance(test_case, ConversationalTestCase):
                         conversational_test_case_counter += 1
 
@@ -746,27 +677,23 @@ async def a_execute_test_cases(
         try:
             await asyncio.wait_for(
                 asyncio.gather(*tasks),
-                timeout=
+                timeout=get_gather_timeout(),
             )
-        except (asyncio.TimeoutError, TimeoutError):
+        except (asyncio.TimeoutError, TimeoutError) as e:
             for t in tasks:
                 if not t.done():
                     t.cancel()
             await asyncio.gather(*tasks, return_exceptions=True)
-
-
-
-            )
+
+            _log_gather_timeout(logger, exc=e)
+
             if not error_config.ignore_errors:
                 raise
 
    else:
        for test_case in test_cases:
            with capture_evaluation_run("test case"):
-                if (
-                    isinstance(test_case, LLMTestCase)
-                    and not test_case.multimodal
-                ):
+                if isinstance(test_case, LLMTestCase):
                    if len(llm_metrics) == 0:
                        continue
                    llm_test_case_counter += 1
@@ -814,34 +741,12 @@ async def a_execute_test_cases(
                    )
                    tasks.append(asyncio.create_task((task)))
 
-                elif (
-                    isinstance(test_case, LLMTestCase) and test_case.multimodal
-                ):
-                    mllm_test_case_counter += 1
-                    copied_multimodal_metrics: List[BaseMultimodalMetric] = (
-                        copy_metrics(mllm_metrics)
-                    )
-                    task = execute_with_semaphore(
-                        func=_a_execute_mllm_test_cases,
-                        metrics=copied_multimodal_metrics,
-                        test_case=test_case,
-                        test_run_manager=test_run_manager,
-                        test_results=test_results,
-                        count=mllm_test_case_counter,
-                        ignore_errors=error_config.ignore_errors,
-                        skip_on_missing_params=error_config.skip_on_missing_params,
-                        _use_bar_indicator=_use_bar_indicator,
-                        _is_assert_test=_is_assert_test,
-                        show_indicator=display_config.show_indicator,
-                    )
-                    tasks.append(asyncio.create_task(task))
-
                await asyncio.sleep(async_config.throttle_value)
 
        try:
            await asyncio.wait_for(
                asyncio.gather(*tasks),
-                timeout=
+                timeout=get_gather_timeout(),
            )
        except (asyncio.TimeoutError, TimeoutError):
            # Cancel any still-pending tasks and drain them
@@ -910,11 +815,18 @@ async def _a_execute_llm_test_cases(
            progress=progress,
        )
    except asyncio.CancelledError:
-
-
-
-
-
+        if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+            msg = (
+                "Cancelled while evaluating metric. "
+                "(DeepEval timeouts are disabled; this cancellation likely came from upstream orchestration or manual cancellation). "
+                "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            )
+        else:
+            msg = (
+                "Timed out/cancelled while evaluating metric. "
+                "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+                "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            )
        for m in metrics:
            if getattr(m, "skipped", False):
                continue
@@ -976,85 +888,8 @@ async def _a_execute_llm_test_cases(
        update_pbar(progress, pbar_id)
 
 
-async def _a_execute_mllm_test_cases(
-    metrics: List[BaseMultimodalMetric],
-    test_case: LLMTestCase,
-    test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, LLMTestCase]],
-    count: int,
-    ignore_errors: bool,
-    skip_on_missing_params: bool,
-    show_indicator: bool,
-    _use_bar_indicator: bool,
-    _is_assert_test: bool,
-    progress: Optional[Progress] = None,
-    pbar_id: Optional[int] = None,
-):
-    show_metrics_indicator = show_indicator and not _use_bar_indicator
-    pbar_test_case_id = add_pbar(
-        progress,
-        f" 🎯 Evaluating test case #{count}",
-        total=len(metrics),
-    )
-
-    for metric in metrics:
-        metric.skipped = False
-        metric.error = None  # Reset metric error
-
-    api_test_case: LLMApiTestCase = create_api_test_case(
-        test_case=test_case, index=count if not _is_assert_test else None
-    )
-    test_start_time = time.perf_counter()
-    try:
-        await measure_metrics_with_indicator(
-            metrics=metrics,
-            test_case=test_case,
-            cached_test_case=None,
-            skip_on_missing_params=skip_on_missing_params,
-            ignore_errors=ignore_errors,
-            show_indicator=show_metrics_indicator,
-            pbar_eval_id=pbar_test_case_id,
-            progress=progress,
-        )
-    except asyncio.CancelledError:
-        msg = (
-            "Timed out/cancelled while evaluating metric. "
-            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
-            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
-        )
-        for m in metrics:
-            if getattr(m, "skipped", False):
-                continue
-            # If the task never finished and didn't set a terminal state, mark it now
-            if getattr(m, "success", None) is None and not getattr(
-                m, "error", None
-            ):
-                m.success = False
-                m.error = msg
-        if not ignore_errors:
-            raise
-    finally:
-        for metric in metrics:
-            if metric.skipped:
-                continue
-
-            metric_data = create_metric_data(metric)
-            api_test_case.update_metric_data(metric_data)
-
-        test_end_time = time.perf_counter()
-        run_duration = test_end_time - test_start_time
-        api_test_case.update_run_duration(run_duration)
-
-        ### Update Test Run ###
-        test_run_manager.update_test_run(api_test_case, test_case)
-        test_results.append(create_test_result(api_test_case))
-        update_pbar(progress, pbar_id)
-
-
 async def _a_execute_conversational_test_cases(
-    metrics: List[
-        Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric]
-    ],
+    metrics: List[Union[BaseMetric, BaseConversationalMetric]],
    test_case: ConversationalTestCase,
    test_run_manager: TestRunManager,
    test_results: List[Union[TestResult, LLMTestCase]],
@@ -1097,11 +932,18 @@ async def _a_execute_conversational_test_cases(
        )
 
    except asyncio.CancelledError:
-
-
-
-
-
+        if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+            msg = (
+                "Cancelled while evaluating metric. "
+                "(DeepEval timeouts are disabled; this cancellation likely came from upstream orchestration or manual cancellation). "
+                "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            )
+        else:
+            msg = (
+                "Timed out/cancelled while evaluating metric. "
+                "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+                "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            )
        for m in metrics:
            if getattr(m, "skipped", False):
                continue
@@ -1211,7 +1053,7 @@ def execute_agentic_test_cases(
            loop.run_until_complete(
                _await_with_outer_deadline(
                    coro,
-                    timeout=
+                    timeout=get_per_task_timeout_seconds(),
                )
            )
        else:
@@ -1538,17 +1380,13 @@ def execute_agentic_test_cases(
 
        # run the golden with a timeout
        start_time = time.perf_counter()
-        deadline =
+        deadline = get_per_task_timeout_seconds()
 
        try:
            run_sync_with_timeout(_run_golden, deadline)
        except (asyncio.TimeoutError, TimeoutError):
            # mark any not yet finished trace level and span level metrics as timed out.
-            msg = (
-                f"Timed out after {deadline:.2f}s while executing agentic test case. "
-                "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
-                "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
-            )
+            msg = _timeout_msg("executing agentic test case", deadline)
 
            if current_trace is not None:
                # Trace-level metrics
@@ -1729,9 +1567,8 @@ async def a_execute_agentic_test_cases(
 
    async def execute_with_semaphore(func: Callable, *args, **kwargs):
        async with semaphore:
-            timeout = _per_task_timeout()
            return await _await_with_outer_deadline(
-                func, *args, timeout=
+                func, *args, timeout=get_per_task_timeout_seconds(), **kwargs
            )
 
    test_run_manager = global_test_run_manager
@@ -1782,7 +1619,7 @@ async def a_execute_agentic_test_cases(
    try:
        await asyncio.wait_for(
            asyncio.gather(*tasks),
-            timeout=
+            timeout=get_gather_timeout(),
        )
    except (asyncio.TimeoutError, TimeoutError):
        # Cancel any still-pending tasks and drain them
@@ -1863,7 +1700,7 @@ async def _a_execute_agentic_test_case(
                await _await_with_outer_deadline(
                    observed_callback,
                    golden.input,
-                    timeout=
+                    timeout=get_per_task_timeout_seconds(),
                )
            else:
                observed_callback(golden.input)
@@ -1957,7 +1794,7 @@ async def _a_execute_agentic_test_case(
        try:
            await asyncio.wait_for(
                asyncio.gather(*child_tasks),
-                timeout=
+                timeout=get_gather_timeout(),
            )
        except (asyncio.TimeoutError, TimeoutError):
            for t in child_tasks:
@@ -1980,11 +1817,18 @@ async def _a_execute_agentic_test_case(
        )
    except asyncio.CancelledError:
        # mark any unfinished metrics as cancelled
-
-
-
-
-
+        if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+            cancel_msg = (
+                "Cancelled while evaluating agentic test case. "
+                "(DeepEval timeouts are disabled; this cancellation likely came from upstream orchestration or manual cancellation). "
+                "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            )
+        else:
+            cancel_msg = (
+                "Timed out/cancelled while evaluating agentic test case. "
+                "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+                "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            )
 
        if trace_metrics:
            for m in trace_metrics:
@@ -2676,8 +2520,9 @@ def a_execute_agentic_test_cases_from_loop(
 
    async def execute_callback_with_semaphore(coroutine: Awaitable):
        async with semaphore:
-
-
+            return await _await_with_outer_deadline(
+                coroutine, timeout=get_per_task_timeout_seconds()
+            )
 
    def evaluate_test_cases(
        progress: Optional[Progress] = None,
@@ -2899,15 +2744,18 @@ def a_execute_agentic_test_cases_from_loop(
            loop.run_until_complete(
                asyncio.wait_for(
                    asyncio.gather(*created_tasks, return_exceptions=True),
-                    timeout=
+                    timeout=get_gather_timeout(),
                )
            )
 
-        except (asyncio.TimeoutError, TimeoutError):
+        except (asyncio.TimeoutError, TimeoutError) as e:
            import traceback
 
+            settings = get_settings()
            pending = [t for t in created_tasks if not t.done()]
 
+            _log_gather_timeout(logger, exc=e, pending=len(pending))
+
            # Log the elapsed time for each task that was pending
            for t in pending:
                meta = task_meta.get(t, {})
@@ -2915,26 +2763,27 @@ def a_execute_agentic_test_cases_from_loop(
                elapsed_time = time.perf_counter() - start_time
 
                # Determine if it was a per task or gather timeout based on task's elapsed time
-                if
-                timeout_type =
+                if not settings.DEEPEVAL_DISABLE_TIMEOUTS:
+                    timeout_type = (
+                        "per-task"
+                        if elapsed_time >= get_per_task_timeout_seconds()
+                        else "gather"
+                    )
+                    logger.info(
+                        " - PENDING %s elapsed_time=%.2fs timeout_type=%s meta=%s",
+                        t.get_name(),
+                        elapsed_time,
+                        timeout_type,
+                        meta,
+                    )
                else:
-
-
-
-
-
-
-                    f"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS for longer task completion time or "
-                    f"DEEPEVAL_TASK_GATHER_BUFFER_SECONDS to allow more time for gathering results."
-                )
+                    logger.info(
+                        " - PENDING %s elapsed_time=%.2fs meta=%s",
+                        t.get_name(),
+                        elapsed_time,
+                        meta,
+                    )
 
-                # Log pending tasks and their stack traces
-                logger.info(
-                    " - PENDING %s elapsed_time=%.2fs meta=%s",
-                    t.get_name(),
-                    elapsed_time,
-                    meta,
-                )
                if loop.get_debug() and get_settings().DEEPEVAL_DEBUG_ASYNC:
                    frames = t.get_stack(limit=6)
                    if frames:
@@ -3116,9 +2965,8 @@ async def _a_evaluate_traces(
 
    async def execute_evals_with_semaphore(func: Callable, *args, **kwargs):
        async with semaphore:
-            timeout = _per_task_timeout()
            return await _await_with_outer_deadline(
-                func, *args, timeout=
+                func, *args, timeout=get_per_task_timeout_seconds(), **kwargs
            )
 
    eval_tasks = []
@@ -3166,7 +3014,7 @@ async def _a_evaluate_traces(
    try:
        await asyncio.wait_for(
            asyncio.gather(*eval_tasks),
-            timeout=
+            timeout=get_gather_timeout(),
        )
    except (asyncio.TimeoutError, TimeoutError):
        for t in eval_tasks:
@@ -3196,9 +3044,8 @@ async def _evaluate_test_case_pairs(
 
    async def execute_with_semaphore(func: Callable, *args, **kwargs):
        async with semaphore:
-            timeout = _per_task_timeout()
            return await _await_with_outer_deadline(
-                func, *args, timeout=
+                func, *args, timeout=get_per_task_timeout_seconds(), **kwargs
            )
 
    tasks = []
@@ -3236,7 +3083,7 @@ async def _evaluate_test_case_pairs(
    try:
        await asyncio.wait_for(
            asyncio.gather(*tasks),
-            timeout=
+            timeout=get_gather_timeout(),
        )
    except (asyncio.TimeoutError, TimeoutError):
        # Cancel any still-pending tasks and drain them