deepeval 3.7.6__py3-none-any.whl → 3.7.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +658 -262
- deepeval/config/utils.py +9 -1
- deepeval/dataset/test_run_tracer.py +4 -6
- deepeval/evaluate/execute.py +153 -94
- deepeval/integrations/pydantic_ai/instrumentator.py +4 -2
- deepeval/integrations/pydantic_ai/otel.py +5 -1
- deepeval/key_handler.py +121 -51
- deepeval/metrics/base_metric.py +9 -3
- deepeval/metrics/g_eval/g_eval.py +6 -1
- deepeval/metrics/indicator.py +8 -4
- deepeval/metrics/mcp/mcp_task_completion.py +15 -16
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +15 -15
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +8 -1
- deepeval/metrics/prompt_alignment/prompt_alignment.py +6 -3
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +30 -28
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +8 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +15 -14
- deepeval/metrics/turn_contextual_precision/template.py +8 -1
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +44 -86
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +44 -82
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +48 -92
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +76 -130
- deepeval/metrics/utils.py +16 -1
- deepeval/models/__init__.py +2 -0
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +5 -4
- deepeval/models/llms/anthropic_model.py +4 -3
- deepeval/models/llms/azure_model.py +4 -3
- deepeval/models/llms/deepseek_model.py +5 -8
- deepeval/models/llms/grok_model.py +5 -8
- deepeval/models/llms/kimi_model.py +5 -8
- deepeval/models/llms/litellm_model.py +2 -0
- deepeval/models/llms/local_model.py +1 -1
- deepeval/models/llms/openai_model.py +4 -3
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +1 -5
- deepeval/simulator/conversation_simulator.py +6 -2
- deepeval/simulator/template.py +3 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/METADATA +3 -3
- {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/RECORD +57 -56
- {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/WHEEL +0 -0
- {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/entry_points.txt +0 -0
deepeval/config/utils.py
CHANGED
@@ -1,7 +1,8 @@
 import json
 import os
 import re
-
+from dotenv import dotenv_values
+from pathlib import Path
 from typing import Any, Iterable, List, Optional


@@ -142,3 +143,10 @@ def dedupe_preserve_order(items: Iterable[str]) -> List[str]:
 def constrain_between(value: float, lo: float, hi: float) -> float:
     """Return value constrained to the inclusive range [lo, hi]."""
     return min(max(value, lo), hi)
+
+
+def read_dotenv_file(path: Path) -> dict[str, str]:
+    if not path.exists():
+        return {}
+    values = dotenv_values(path)
+    return {key: value for key, value in values.items() if value is not None}
deepeval/dataset/test_run_tracer.py
CHANGED

@@ -5,6 +5,7 @@ from opentelemetry.trace import Tracer as OTelTracer
 from opentelemetry.sdk.trace import SpanProcessor
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from deepeval.config.settings import get_settings

 try:
     from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
@@ -26,11 +27,8 @@ def is_opentelemetry_available():

 from deepeval.confident.api import get_confident_api_key

-
-
-    if os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
-    else "https://otel.confident-ai.com"
-)
+settings = get_settings()
+OTLP_ENDPOINT = str(settings.CONFIDENT_OTEL_URL)
 # OTLP_ENDPOINT = "http://127.0.0.1:4318"

 # Module-level globals to be imported and used by other code
@@ -67,7 +65,7 @@ def init_global_test_run_tracer(api_key: Optional[str] = None):

     provider = TracerProvider()
     exporter = OTLPSpanExporter(
-        endpoint=f"{OTLP_ENDPOINT}
+        endpoint=f"{OTLP_ENDPOINT}v1/traces",
         headers={"x-confident-api-key": api_key},
     )
     provider.add_span_processor(RunIdSpanProcessor())
deepeval/evaluate/execute.py
CHANGED
@@ -51,6 +51,10 @@ from deepeval.utils import (
     shorten,
     len_medium,
     format_error_text,
+    are_timeouts_disabled,
+    get_per_task_timeout_seconds,
+    get_gather_timeout_seconds,
+    get_gather_timeout,
 )
 from deepeval.telemetry import capture_evaluation_run
 from deepeval.metrics import (
@@ -109,6 +113,57 @@ from deepeval.test_run.hyperparameters import (
 logger = logging.getLogger(__name__)


+def _timeout_msg(action: str, seconds: float) -> str:
+    if are_timeouts_disabled():
+        return (
+            f"Timeout occurred while {action} "
+            "(DeepEval timeouts are disabled; this likely came from the model/provider SDK or network layer). "
+            "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+        )
+    return (
+        f"Timed out after {seconds:.2f}s while {action}. "
+        "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+        "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+    )
+
+
+def _log_gather_timeout(
+    logger,
+    *,
+    exc: Optional[BaseException] = None,
+    pending: Optional[int] = None,
+) -> None:
+    settings = get_settings()
+    if are_timeouts_disabled():
+        logger.warning(
+            "A task raised %s while waiting for gathered results; DeepEval gather/per-task timeouts are disabled%s. "
+            "This likely came from the model/provider SDK or network layer.",
+            type(exc).__name__ if exc else "TimeoutError",
+            f" (pending={pending})" if pending is not None else "",
+            exc_info=settings.DEEPEVAL_LOG_STACK_TRACES,
+        )
+    else:
+        if pending is not None:
+            logger.warning(
+                "Gather TIMEOUT after %.1fs; pending=%d tasks. "
+                "Some metrics may be marked as timed out. "
+                "To give tasks more time, consider increasing "
+                "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or "
+                "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE.",
+                get_gather_timeout_seconds(),
+                pending,
+            )
+
+        else:
+            logger.warning(
+                "gather TIMEOUT after %.1fs. Some metrics may be marked as timed out. "
+                "To give tasks more time, consider increasing "
+                "DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or "
+                "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE.",
+                get_gather_timeout_seconds(),
+            )
+
+
 def _skip_metrics_for_error(
     span: Optional[BaseSpan] = None,
     trace: Optional[Trace] = None,
@@ -217,18 +272,6 @@ async def _snapshot_tasks():
     return {t for t in asyncio.all_tasks() if t is not cur}


-def _per_task_timeout() -> float:
-    return get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-
-
-def _gather_timeout() -> float:
-    s = get_settings()
-    return (
-        s.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-        + s.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
-    )
-
-
 def filter_duplicate_results(
     main_result: TestResult, results: List[TestResult]
 ) -> List[TestResult]:
@@ -250,6 +293,10 @@ async def _await_with_outer_deadline(obj, *args, timeout: float, **kwargs):
            coro = obj
        else:
            coro = obj(*args, **kwargs)
+
+        if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+            return await coro
+
        return await asyncio.wait_for(coro, timeout=timeout)
    finally:
        reset_outer_deadline(token)
@@ -350,7 +397,7 @@ def execute_test_cases(
        index_of = {id(m): i for i, m in enumerate(metrics_for_case)}
        current_index = -1
        start_time = time.perf_counter()
-        deadline_timeout =
+        deadline_timeout = get_per_task_timeout_seconds()
        deadline_token = set_outer_deadline(deadline_timeout)
        new_cached_test_case: CachedTestCase = None
        try:
@@ -435,25 +482,20 @@ def execute_test_cases(

            run_sync_with_timeout(_run_case, deadline_timeout)
        except (asyncio.TimeoutError, TimeoutError):
-
-
-
-
-            )
-            for i, m in enumerate(metrics_for_case):
-                if getattr(m, "skipped", False):
+
+            msg = _timeout_msg("evaluating metric", deadline_timeout)
+            for i, metric in enumerate(metrics_for_case):
+                if metric.skipped:
                    continue
                # already finished or errored? leave it
-                if
-                    m, "error", None
-                ):
+                if metric.success is not None or metric.error is not None:
                    continue
                if i == current_index:
-
-
+                    metric.success = False
+                    metric.error = msg
                elif i > current_index:
-
-
+                    metric.success = False
+                    metric.error = "Skipped due to case timeout."

            if not error_config.ignore_errors:
                raise
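A toy version of the bookkeeping above: on a case timeout, the metric that was in flight (`current_index`) gets the timeout message, anything after it is marked as skipped, and metrics that already finished or errored are left untouched. The `MetricStub` class and metric names are purely illustrative:

```python
# Illustrative only; real deepeval metrics expose skipped/success/error attributes.
from dataclasses import dataclass
from typing import Optional


@dataclass
class MetricStub:
    name: str
    skipped: bool = False
    success: Optional[bool] = None
    error: Optional[str] = None


metrics = [
    MetricStub("answer_relevancy", success=True),  # already finished
    MetricStub("faithfulness"),                    # was running when the deadline hit
    MetricStub("toxicity"),                        # never started
]
current_index = 1
msg = "Timed out after 240.00s while evaluating metric."

for i, metric in enumerate(metrics):
    if metric.skipped or metric.success is not None or metric.error is not None:
        continue
    if i == current_index:
        metric.success, metric.error = False, msg
    elif i > current_index:
        metric.success, metric.error = False, "Skipped due to case timeout."

print([(m.name, m.success, m.error) for m in metrics])
```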
@@ -478,12 +520,12 @@ def execute_test_cases(
            )

            # Attach MetricData for *all* metrics (finished or synthesized)
-            for i,
-                if
+            for i, metric in enumerate(metrics_for_case):
+                if metric.skipped:
                    continue
                if not emitted[i]:
                    api_test_case.update_metric_data(
-                        create_metric_data(
+                        create_metric_data(metric)
                    )

            elapsed = time.perf_counter() - start_time
@@ -536,9 +578,8 @@ async def a_execute_test_cases(

    async def execute_with_semaphore(func: Callable, *args, **kwargs):
        async with semaphore:
-            timeout = _per_task_timeout()
            return await _await_with_outer_deadline(
-                func, *args, timeout=
+                func, *args, timeout=get_per_task_timeout_seconds(), **kwargs
            )

    global_test_run_cache_manager.disable_write_cache = (
@@ -636,17 +677,16 @@ async def a_execute_test_cases(
        try:
            await asyncio.wait_for(
                asyncio.gather(*tasks),
-                timeout=
+                timeout=get_gather_timeout(),
            )
-        except (asyncio.TimeoutError, TimeoutError):
+        except (asyncio.TimeoutError, TimeoutError) as e:
            for t in tasks:
                if not t.done():
                    t.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)
-
-
-
-            )
+
+            _log_gather_timeout(logger, exc=e)
+
            if not error_config.ignore_errors:
                raise

@@ -706,7 +746,7 @@ async def a_execute_test_cases(
        try:
            await asyncio.wait_for(
                asyncio.gather(*tasks),
-                timeout=
+                timeout=get_gather_timeout(),
            )
        except (asyncio.TimeoutError, TimeoutError):
            # Cancel any still-pending tasks and drain them
@@ -775,11 +815,18 @@ async def _a_execute_llm_test_cases(
                progress=progress,
            )
    except asyncio.CancelledError:
-
-
-
-
-
+        if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+            msg = (
+                "Cancelled while evaluating metric. "
+                "(DeepEval timeouts are disabled; this cancellation likely came from upstream orchestration or manual cancellation). "
+                "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            )
+        else:
+            msg = (
+                "Timed out/cancelled while evaluating metric. "
+                "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+                "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            )
        for m in metrics:
            if getattr(m, "skipped", False):
                continue
@@ -885,11 +932,18 @@ async def _a_execute_conversational_test_cases(
            )

    except asyncio.CancelledError:
-
-
-
-
-
+        if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+            msg = (
+                "Cancelled while evaluating metric. "
+                "(DeepEval timeouts are disabled; this cancellation likely came from upstream orchestration or manual cancellation). "
+                "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            )
+        else:
+            msg = (
+                "Timed out/cancelled while evaluating metric. "
+                "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+                "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            )
        for m in metrics:
            if getattr(m, "skipped", False):
                continue
@@ -999,7 +1053,7 @@ def execute_agentic_test_cases(
                loop.run_until_complete(
                    _await_with_outer_deadline(
                        coro,
-                        timeout=
+                        timeout=get_per_task_timeout_seconds(),
                    )
                )
            else:
@@ -1326,17 +1380,13 @@ def execute_agentic_test_cases(

        # run the golden with a timeout
        start_time = time.perf_counter()
-        deadline =
+        deadline = get_per_task_timeout_seconds()

        try:
            run_sync_with_timeout(_run_golden, deadline)
        except (asyncio.TimeoutError, TimeoutError):
            # mark any not yet finished trace level and span level metrics as timed out.
-            msg = (
-                f"Timed out after {deadline:.2f}s while executing agentic test case. "
-                "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
-                "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
-            )
+            msg = _timeout_msg("executing agentic test case", deadline)

            if current_trace is not None:
                # Trace-level metrics
@@ -1517,9 +1567,8 @@ async def a_execute_agentic_test_cases(

    async def execute_with_semaphore(func: Callable, *args, **kwargs):
        async with semaphore:
-            timeout = _per_task_timeout()
            return await _await_with_outer_deadline(
-                func, *args, timeout=
+                func, *args, timeout=get_per_task_timeout_seconds(), **kwargs
            )

    test_run_manager = global_test_run_manager
@@ -1570,7 +1619,7 @@ async def a_execute_agentic_test_cases(
        try:
            await asyncio.wait_for(
                asyncio.gather(*tasks),
-                timeout=
+                timeout=get_gather_timeout(),
            )
        except (asyncio.TimeoutError, TimeoutError):
            # Cancel any still-pending tasks and drain them
@@ -1651,7 +1700,7 @@ async def _a_execute_agentic_test_case(
                await _await_with_outer_deadline(
                    observed_callback,
                    golden.input,
-                    timeout=
+                    timeout=get_per_task_timeout_seconds(),
                )
            else:
                observed_callback(golden.input)
@@ -1745,7 +1794,7 @@ async def _a_execute_agentic_test_case(
        try:
            await asyncio.wait_for(
                asyncio.gather(*child_tasks),
-                timeout=
+                timeout=get_gather_timeout(),
            )
        except (asyncio.TimeoutError, TimeoutError):
            for t in child_tasks:
@@ -1768,11 +1817,18 @@ async def _a_execute_agentic_test_case(
            )
    except asyncio.CancelledError:
        # mark any unfinished metrics as cancelled
-
-
-
-
-
+        if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+            cancel_msg = (
+                "Cancelled while evaluating agentic test case. "
+                "(DeepEval timeouts are disabled; this cancellation likely came from upstream orchestration or manual cancellation). "
+                "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            )
+        else:
+            cancel_msg = (
+                "Timed out/cancelled while evaluating agentic test case. "
+                "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+                "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            )

        if trace_metrics:
            for m in trace_metrics:
@@ -2464,8 +2520,9 @@ def a_execute_agentic_test_cases_from_loop(

    async def execute_callback_with_semaphore(coroutine: Awaitable):
        async with semaphore:
-
-
+            return await _await_with_outer_deadline(
+                coroutine, timeout=get_per_task_timeout_seconds()
+            )

    def evaluate_test_cases(
        progress: Optional[Progress] = None,
@@ -2687,15 +2744,18 @@ def a_execute_agentic_test_cases_from_loop(
        loop.run_until_complete(
            asyncio.wait_for(
                asyncio.gather(*created_tasks, return_exceptions=True),
-                timeout=
+                timeout=get_gather_timeout(),
            )
        )

-    except (asyncio.TimeoutError, TimeoutError):
+    except (asyncio.TimeoutError, TimeoutError) as e:
        import traceback

+        settings = get_settings()
        pending = [t for t in created_tasks if not t.done()]

+        _log_gather_timeout(logger, exc=e, pending=len(pending))
+
        # Log the elapsed time for each task that was pending
        for t in pending:
            meta = task_meta.get(t, {})
@@ -2703,26 +2763,27 @@ def a_execute_agentic_test_cases_from_loop(
            elapsed_time = time.perf_counter() - start_time

            # Determine if it was a per task or gather timeout based on task's elapsed time
-            if
-                timeout_type =
+            if not settings.DEEPEVAL_DISABLE_TIMEOUTS:
+                timeout_type = (
+                    "per-task"
+                    if elapsed_time >= get_per_task_timeout_seconds()
+                    else "gather"
+                )
+                logger.info(
+                    " - PENDING %s elapsed_time=%.2fs timeout_type=%s meta=%s",
+                    t.get_name(),
+                    elapsed_time,
+                    timeout_type,
+                    meta,
+                )
            else:
-
-
-
-
-
-
-                f"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS for longer task completion time or "
-                f"DEEPEVAL_TASK_GATHER_BUFFER_SECONDS to allow more time for gathering results."
-                )
+                logger.info(
+                    " - PENDING %s elapsed_time=%.2fs meta=%s",
+                    t.get_name(),
+                    elapsed_time,
+                    meta,
+                )

-            # Log pending tasks and their stack traces
-            logger.info(
-                " - PENDING %s elapsed_time=%.2fs meta=%s",
-                t.get_name(),
-                elapsed_time,
-                meta,
-            )
            if loop.get_debug() and get_settings().DEEPEVAL_DEBUG_ASYNC:
                frames = t.get_stack(limit=6)
                if frames:
@@ -2904,9 +2965,8 @@ async def _a_evaluate_traces(

    async def execute_evals_with_semaphore(func: Callable, *args, **kwargs):
        async with semaphore:
-            timeout = _per_task_timeout()
            return await _await_with_outer_deadline(
-                func, *args, timeout=
+                func, *args, timeout=get_per_task_timeout_seconds(), **kwargs
            )

    eval_tasks = []
@@ -2954,7 +3014,7 @@ async def _a_evaluate_traces(
        try:
            await asyncio.wait_for(
                asyncio.gather(*eval_tasks),
-                timeout=
+                timeout=get_gather_timeout(),
            )
        except (asyncio.TimeoutError, TimeoutError):
            for t in eval_tasks:
@@ -2984,9 +3044,8 @@ async def _evaluate_test_case_pairs(

    async def execute_with_semaphore(func: Callable, *args, **kwargs):
        async with semaphore:
-            timeout = _per_task_timeout()
            return await _await_with_outer_deadline(
-                func, *args, timeout=
+                func, *args, timeout=get_per_task_timeout_seconds(), **kwargs
            )

    tasks = []
@@ -3024,7 +3083,7 @@ async def _evaluate_test_case_pairs(
        try:
            await asyncio.wait_for(
                asyncio.gather(*tasks),
-                timeout=
+                timeout=get_gather_timeout(),
            )
        except (asyncio.TimeoutError, TimeoutError):
            # Cancel any still-pending tasks and drain them
deepeval/integrations/pydantic_ai/instrumentator.py
CHANGED

@@ -27,6 +27,7 @@ from deepeval.tracing.types import (
 )

 logger = logging.getLogger(__name__)
+settings = get_settings()

 try:
     # Optional dependencies
@@ -48,7 +49,7 @@ except ImportError as e:
     dependency_installed = False

     # Preserve previous behavior: only log when verbose mode is enabled.
-    if
+    if settings.DEEPEVAL_VERBOSE_MODE:
         if isinstance(e, ModuleNotFoundError):
             logger.warning(
                 "Optional tracing dependency not installed: %s",
@@ -104,7 +105,8 @@ else:
     ReadableSpan = _ReadableSpan

 # OTLP_ENDPOINT = "http://127.0.0.1:4318/v1/traces"
-OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
+# OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
+OTLP_ENDPOINT = str(settings.CONFIDENT_OTEL_URL) + "v1/traces"
 init_clock_bridge()  # initialize clock bridge for perf_counter() to epoch_nanos conversion

deepeval/integrations/pydantic_ai/otel.py
CHANGED

@@ -1,6 +1,7 @@
 import warnings
 from typing import Optional
 from deepeval.telemetry import capture_tracing_integration
+from deepeval.config.settings import get_settings

 try:
     from opentelemetry import trace
@@ -23,7 +24,10 @@ def is_opentelemetry_available():
     return True


-
+settings = get_settings()
+# OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
+
+OTLP_ENDPOINT = str(settings.CONFIDENT_OTEL_URL) + "v1/traces"


 def instrument_pydantic_ai(api_key: Optional[str] = None):