deepeval 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +42 -10
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/logging.py +33 -0
- deepeval/config/settings.py +176 -16
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +118 -60
- deepeval/evaluate/utils.py +20 -116
- deepeval/integrations/crewai/__init__.py +6 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +37 -15
- deepeval/metrics/hallucination/hallucination.py +12 -1
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +13 -0
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/prompt_alignment/prompt_alignment.py +53 -24
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +3 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/models/retry_policy.py +202 -11
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +24 -34
- deepeval/openai/patch.py +256 -161
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +98 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +244 -62
- deepeval/prompt/utils.py +144 -2
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +8 -5
- deepeval/test_case/api.py +131 -0
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +104 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/message_types/__init__.py +10 -0
- deepeval/tracing/message_types/base.py +6 -0
- deepeval/tracing/message_types/messages.py +14 -0
- deepeval/tracing/message_types/tools.py +18 -0
- deepeval/tracing/otel/exporter.py +0 -6
- deepeval/tracing/otel/utils.py +58 -8
- deepeval/tracing/trace_context.py +73 -4
- deepeval/tracing/trace_test_manager.py +19 -0
- deepeval/tracing/tracing.py +52 -4
- deepeval/tracing/types.py +16 -0
- deepeval/tracing/utils.py +8 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/RECORD +97 -87
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0

deepeval/metrics/summarization/summarization.py
CHANGED

@@ -1,6 +1,7 @@
 from typing import List, Optional, Union
 import asyncio

+from deepeval.metrics.api import metric_data_manager
 from deepeval.test_case import (
     LLMTestCase,
     LLMTestCaseParams,
@@ -73,6 +74,7 @@ class SummarizationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -88,6 +90,7 @@ class SummarizationMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -121,7 +124,10 @@ class SummarizationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def a_measure(
@@ -129,6 +135,7 @@ class SummarizationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -171,6 +178,10 @@ class SummarizationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

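The same change repeats across the metric files in this release (task completion, tool correctness, toxicity, turn relevancy, and the other metrics in the file list): `measure` and `a_measure` gain a private `_log_metric_to_confident: bool = True` keyword, and after scoring they call `metric_data_manager.post_metric_if_enabled(self, test_case=test_case)` from the new `deepeval/metrics/api.py`. A minimal sketch of what this looks like from the caller's side (the keyword is underscore-prefixed, so treat it as internal API; constructor defaults are assumed):

    from deepeval.metrics import SummarizationMetric
    from deepeval.test_case import LLMTestCase

    metric = SummarizationMetric()
    test_case = LLMTestCase(
        input="Summarize the article.",
        actual_output="The article argues that ...",
    )

    # 3.6.7 can post metric data to Confident AI after scoring when enabled;
    # the new keyword opts a single call out of that post.
    score = metric.measure(test_case, _log_metric_to_confident=False)
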
deepeval/metrics/task_completion/task_completion.py
CHANGED

@@ -50,6 +50,7 @@ class TaskCompletionMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         has_trace: bool = isinstance(test_case._trace_dict, Dict)
         if not has_trace:
@@ -66,6 +67,7 @@ class TaskCompletionMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -89,6 +91,7 @@ class TaskCompletionMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         has_trace: bool = isinstance(test_case._trace_dict, Dict)
         if not has_trace:

deepeval/metrics/tool_correctness/tool_correctness.py
CHANGED

@@ -12,6 +12,7 @@ from deepeval.test_case import (
     ToolCall,
 )
 from deepeval.metrics import BaseMetric
+from deepeval.metrics.api import metric_data_manager


 class ToolCorrectnessMetric(BaseMetric):
@@ -45,6 +46,7 @@ class ToolCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -83,6 +85,11 @@ class ToolCorrectnessMetric(BaseMetric):
             ]
             steps.append(f"Score: {self.score}\nReason: {self.reason}")
             self.verbose_logs = construct_verbose_logs(self, steps=steps)
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def a_measure(
@@ -90,6 +97,7 @@ class ToolCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         return self.measure(
             test_case,

deepeval/metrics/toxicity/toxicity.py
CHANGED

@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.toxicity.template import ToxicityTemplate
 from deepeval.metrics.toxicity.schema import *
+from deepeval.metrics.api import metric_data_manager


 class ToxicityMetric(BaseMetric):
@@ -50,6 +51,7 @@ class ToxicityMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -65,6 +67,7 @@ class ToxicityMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -84,6 +87,10 @@ class ToxicityMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

@@ -92,6 +99,7 @@ class ToxicityMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -122,6 +130,10 @@ class ToxicityMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

deepeval/metrics/turn_relevancy/turn_relevancy.py
CHANGED

@@ -20,6 +20,7 @@ from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import ConversationalTestCase, Turn, TurnParams
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.turn_relevancy.schema import *
+from deepeval.metrics.api import metric_data_manager


 class TurnRelevancyMetric(BaseConversationalMetric):
@@ -49,6 +50,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -65,6 +67,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -91,6 +94,10 @@ class TurnRelevancyMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def a_measure(
@@ -98,6 +105,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -134,6 +142,10 @@ class TurnRelevancyMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self) -> str:

deepeval/models/llms/grok_model.py
CHANGED

@@ -56,8 +56,8 @@ model_pricing = {
 class GrokModel(DeepEvalBaseLLM):
     def __init__(
         self,
-        api_key: Optional[str] = None,
         model: Optional[str] = None,
+        api_key: Optional[str] = None,
         temperature: float = 0,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,

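Note that this swap is more than cosmetic: a 3.6.5 caller constructing `GrokModel` with positional arguments would now hand its API key to `model`. Keyword arguments are unaffected, so a call that is safe on either version looks like the sketch below (the `deepeval.models` import path and the model name are illustrative assumptions):

    from deepeval.models import GrokModel

    # Pass both by keyword; the positional order changed between 3.6.5 and 3.6.7.
    grok = GrokModel(model="grok-3", api_key="xai-...")
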
deepeval/models/retry_policy.py
CHANGED

@@ -33,9 +33,13 @@ Retry logging (settings; read at call time):

 from __future__ import annotations

+import asyncio
+import inspect
+import itertools
+import functools
+import threading
 import logging

-from deepeval.utils import read_env_int, read_env_float
 from dataclasses import dataclass, field
 from typing import Callable, Iterable, Mapping, Optional, Sequence, Tuple, Union
 from collections.abc import Mapping as ABCMapping

@@ -58,6 +62,9 @@ from deepeval.config.settings import get_settings

 logger = logging.getLogger(__name__)
 Provider = Union[str, PS]
+_MAX_TIMEOUT_THREADS = get_settings().DEEPEVAL_TIMEOUT_THREAD_LIMIT
+_TIMEOUT_SEMA = threading.BoundedSemaphore(_MAX_TIMEOUT_THREADS)
+_WORKER_ID = itertools.count(1)

 # --------------------------
 # Policy description

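Using `BoundedSemaphore` rather than a plain `Semaphore` for the new timeout-worker cap is a small but deliberate safety choice: releasing more times than acquired raises `ValueError` instead of silently widening the concurrency ceiling, so a bookkeeping bug in the worker lifecycle fails loudly. For instance:

    import threading

    sema = threading.BoundedSemaphore(2)
    sema.acquire()
    sema.release()
    sema.release()  # ValueError: released too many times
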
@@ -184,6 +191,12 @@ def extract_error_code(
 # Predicate factory
 # --------------------------

+_BUILTIN_TIMEOUT_EXCS = (
+    (TimeoutError,)
+    if asyncio.TimeoutError is TimeoutError
+    else (TimeoutError, asyncio.TimeoutError)
+)
+

 def make_is_transient(
     policy: ErrorPolicy,

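The conditional tuple exists because Python 3.11 made `asyncio.TimeoutError` an alias of the builtin `TimeoutError`, so listing both there would be redundant; on 3.10 and earlier they are distinct classes and both must be matched. A quick check:

    import asyncio
    import sys

    # Prints True on Python 3.11+, False on 3.10 and earlier.
    print(sys.version_info[:2], asyncio.TimeoutError is TimeoutError)
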
@@ -213,6 +226,9 @@
     )

     def _pred(e: Exception) -> bool:
+        if isinstance(e, _BUILTIN_TIMEOUT_EXCS):
+            return True
+
         if isinstance(e, policy.auth_excs):
             return False

@@ -245,18 +261,23 @@

 class StopFromEnv(stop_base):
     def __call__(self, retry_state):
-        … (1 removed line; content not captured in this extract)
+        settings = get_settings()
+        attempts = (
+            settings.DEEPEVAL_RETRY_MAX_ATTEMPTS
+        )  # TODO: add constraints in settings
         return stop_after_attempt(attempts)(retry_state)


 class WaitFromEnv(wait_base):
     def __call__(self, retry_state):
-        … (6 removed lines; content not captured in this extract)
+        settings = get_settings()
+        initial = settings.DEEPEVAL_RETRY_INITIAL_SECONDS
+        exp_base = settings.DEEPEVAL_RETRY_EXP_BASE
+        jitter = settings.DEEPEVAL_RETRY_JITTER
+        cap = settings.DEEPEVAL_RETRY_CAP_SECONDS
+
+        if cap == 0:  # <- 0 means no backoff sleeps or jitter
+            return 0
         return wait_exponential_jitter(
             initial=initial, exp_base=exp_base, jitter=jitter, max=cap
         )(retry_state)

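Both the stop and wait strategies now pull their knobs from `get_settings()` on every call (replacing the removed `read_env_int`/`read_env_float` helpers), so retry behaviour can be tuned without rebuilding the decorator. Assuming the settings layer maps these names to environment variables one-to-one, as the naming suggests, configuration would look like:

    import os

    os.environ["DEEPEVAL_RETRY_MAX_ATTEMPTS"] = "5"     # stop after 5 attempts
    os.environ["DEEPEVAL_RETRY_INITIAL_SECONDS"] = "1"  # first backoff around 1s
    os.environ["DEEPEVAL_RETRY_EXP_BASE"] = "2"         # exponential growth factor
    os.environ["DEEPEVAL_RETRY_JITTER"] = "1"           # up to 1s of added jitter
    os.environ["DEEPEVAL_RETRY_CAP_SECONDS"] = "0"      # 0 disables backoff entirely
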
@@ -324,10 +345,11 @@ def dynamic_retry(provider: Provider):

 def _retry_log_levels():
     s = get_settings()
+    base_level = s.LOG_LEVEL if s.LOG_LEVEL is not None else logging.INFO
     before_level = s.DEEPEVAL_RETRY_BEFORE_LOG_LEVEL
     after_level = s.DEEPEVAL_RETRY_AFTER_LOG_LEVEL
     return (
-        before_level if before_level is not None else
+        before_level if before_level is not None else base_level,
         after_level if after_level is not None else logging.ERROR,
     )

@@ -394,21 +416,190 @@ def make_after_log(slug: str):
     return _after


+def _make_timeout_error(timeout_seconds: float) -> TimeoutError:
+    settings = get_settings()
+    if logger.isEnabledFor(logging.DEBUG):
+        logger.debug(
+            "retry config: per_attempt=%s s, max_attempts=%s, per_task_budget=%s s",
+            timeout_seconds,
+            settings.DEEPEVAL_RETRY_MAX_ATTEMPTS,
+            settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+        )
+    msg = (
+        f"call timed out after {timeout_seconds:g}s (per attempt). "
+        "Increase DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS (0 disables) or reduce work per attempt."
+    )
+    return TimeoutError(msg)
+
+
+def _run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
+    """
+    Run a synchronous callable with a soft timeout enforced by a helper thread,
+    with a global cap on concurrent timeout-workers.
+
+    How it works
+    ------------
+    - A module-level BoundedSemaphore (size = settings.DEEPEVAL_TIMEOUT_THREAD_LIMIT)
+      gates creation of timeout worker threads. If no permit is available, this call
+      blocks until a slot frees up. If settings.DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS
+      > 0 and acquisition takes longer than that, a warning is logged before continuing
+      to wait.
+    - Once a permit is acquired, a daemon thread executes `func(*args, **kwargs)`.
+    - We wait up to `timeout_seconds` for completion. If the timeout elapses, we raise
+      `TimeoutError`. The worker thread is not killed, it continues and releases the semaphore when it eventually finishes.
+    - If the worker finishes in time, we return its result or re-raise its exception
+      (with original traceback).
+
+    Cancellation semantics
+    ----------------------
+    This is a soft timeout: Python threads cannot be forcibly terminated. When timeouts
+    are rare this is fine. If timeouts are common, consider moving to:
+    - a shared ThreadPoolExecutor (caps threads and amortizes creation), or
+    - worker process (supports killing in-flight processes)
+
+    Concurrency control & logging
+    -----------------------------
+    - Concurrency is bounded by `DEEPEVAL_TIMEOUT_THREAD_LIMIT`.
+    - If acquisition exceeds `DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS`, we log a
+      warning and then block until a slot is available.
+    - On timeout, if DEBUG is enabled and `DEEPEVAL_VERBOSE_MODE` is True, we log a short
+      thread sample to help diagnose pressure.
+
+    Args:
+        func: Synchronous callable to execute.
+        timeout_seconds: Float seconds for the soft timeout (0/None disables).
+        *args, **kwargs: Passed through to `func`.
+
+    Returns:
+        Whatever `func` returns.
+
+    Raises:
+        TimeoutError: If `timeout_seconds` elapse before completion.
+        BaseException: If `func` raises, the same exception is re-raised with its
+            original traceback.
+    """
+    if not timeout_seconds or timeout_seconds <= 0:
+        return func(*args, **kwargs)
+
+    # try to respect the global cap on concurrent timeout workers
+    warn_after = float(
+        get_settings().DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS or 0.0
+    )
+    if warn_after > 0:
+        acquired = _TIMEOUT_SEMA.acquire(timeout=warn_after)
+        if not acquired:
+            logger.warning(
+                "timeout thread limit reached (%d); waiting for a slot...",
+                _MAX_TIMEOUT_THREADS,
+            )
+            _TIMEOUT_SEMA.acquire()
+    else:
+        _TIMEOUT_SEMA.acquire()
+
+    done = threading.Event()
+    result = {"value": None, "exc": None}
+
+    def target():
+        try:
+            result["value"] = func(*args, **kwargs)
+        except BaseException as e:
+            result["exc"] = e
+        finally:
+            done.set()
+            _TIMEOUT_SEMA.release()
+
+    t = threading.Thread(
+        target=target,
+        daemon=True,
+        name=f"deepeval-timeout-worker-{next(_WORKER_ID)}",
+    )
+
+    try:
+        t.start()
+    except BaseException:
+        _TIMEOUT_SEMA.release()
+        raise
+
+    finished = done.wait(timeout_seconds)
+    if not finished:
+        if (
+            logger.isEnabledFor(logging.DEBUG)
+            and get_settings().DEEPEVAL_VERBOSE_MODE
+        ):
+            names = [th.name for th in threading.enumerate()[:10]]
+            logger.debug(
+                "timeout after %.3fs (active_threads=%d, sample=%s)",
+                timeout_seconds,
+                threading.active_count(),
+                names,
+            )
+        raise _make_timeout_error(timeout_seconds)
+
+    # Completed within time: return or raise
+    if result["exc"] is not None:
+        exc = result["exc"]
+        raise exc.with_traceback(getattr(exc, "__traceback__", None))
+    return result["value"]
+
+
 def create_retry_decorator(provider: Provider):
     """
     Build a Tenacity @retry decorator wired to our dynamic retry policy
     for the given provider slug.
     """
     slug = slugify(provider)
-
-    return retry(
+    base_retry = retry(
         wait=dynamic_wait(),
         stop=dynamic_stop(),
         retry=dynamic_retry(slug),
         before_sleep=make_before_sleep_log(slug),
         after=make_after_log(slug),
+        reraise=False,
     )

+    def _decorator(func):
+        if inspect.iscoroutinefunction(func):
+
+            @functools.wraps(func)
+            async def attempt(*args, **kwargs):
+                timeout_seconds = (
+                    get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
+                )
+                coro = func(*args, **kwargs)
+                if timeout_seconds > 0:
+                    try:
+                        return await asyncio.wait_for(coro, timeout_seconds)
+                    except asyncio.TimeoutError as e:
+                        if (
+                            logger.isEnabledFor(logging.DEBUG)
+                            and get_settings().DEEPEVAL_VERBOSE_MODE is True
+                        ):
+                            logger.debug(
+                                "async timeout after %.3fs (active_threads=%d, tasks=%d)",
+                                timeout_seconds,
+                                threading.active_count(),
+                                len(asyncio.all_tasks()),
+                            )
+                        raise _make_timeout_error(timeout_seconds) from e
+                return await coro
+
+            return base_retry(attempt)
+
+        @functools.wraps(func)
+        def attempt(*args, **kwargs):
+            timeout_seconds = (
+                get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
+            )
+            if timeout_seconds > 0:
+                return _run_sync_with_timeout(
+                    func, timeout_seconds, *args, **kwargs
+                )
+            return func(*args, **kwargs)
+
+        return base_retry(attempt)
+
+    return _decorator
+

 def _httpx_net_excs() -> tuple[type, ...]:
     try:

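Taken together, `create_retry_decorator` now wraps each attempt in a per-attempt timeout before handing it to Tenacity: coroutines get a cancellable `asyncio.wait_for`, while sync callables get the soft, thread-based timeout of `_run_sync_with_timeout`. A self-contained sketch of that shape, stripped of Tenacity and the semaphore and using hypothetical names:

    import asyncio
    import functools
    import inspect
    import threading

    def with_attempt_timeout(timeout_seconds: float):
        def decorator(func):
            if inspect.iscoroutinefunction(func):

                @functools.wraps(func)
                async def async_attempt(*args, **kwargs):
                    # Async path: coroutines are cancellable, so wait_for suffices.
                    return await asyncio.wait_for(
                        func(*args, **kwargs), timeout_seconds
                    )

                return async_attempt

            @functools.wraps(func)
            def sync_attempt(*args, **kwargs):
                # Sync path: soft timeout. The worker thread is never killed;
                # the caller simply stops waiting for it.
                outcome = {}
                done = threading.Event()

                def target():
                    try:
                        outcome["value"] = func(*args, **kwargs)
                    except BaseException as e:
                        outcome["exc"] = e
                    finally:
                        done.set()

                threading.Thread(target=target, daemon=True).start()
                if not done.wait(timeout_seconds):
                    raise TimeoutError(f"timed out after {timeout_seconds:g}s")
                if "exc" in outcome:
                    raise outcome["exc"]
                return outcome["value"]

            return sync_attempt

        return decorator
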
deepeval/openai/__init__.py
CHANGED

@@ -1,37 +1,19 @@
-… (5 removed lines; only an "import" fragment was captured in this extract)
+try:
+    import openai  # noqa: F401
+except ImportError:
+    raise ModuleNotFoundError(
+        "Please install OpenAI to use this feature: 'pip install openai'"
+    )


-… (5 removed lines; content not captured in this extract)
-    loader = SourceFileLoader("deepeval_openai", openai_spec.origin)
-    new_spec = importlib.util.spec_from_loader(
-        "deepeval_openai",
-        loader,
-        origin=openai_spec.origin,
-        is_package=True,
-    )
-    deepeval_openai = importlib.util.module_from_spec(new_spec)
-    deepeval_openai.__path__ = package_dirs
-    sys.modules["deepeval_openai"] = deepeval_openai
-    loader.exec_module(deepeval_openai)
-    patch_openai(deepeval_openai)
-    return deepeval_openai
+try:
+    from openai import OpenAI, AsyncOpenAI  # noqa: F401
+except ImportError:
+    OpenAI = None  # type: ignore
+    AsyncOpenAI = None  # type: ignore


-… (2 removed lines; only an "openai" fragment was captured)
-OpenAI = patched_openai.OpenAI
-AsyncOpenAI = patched_openai.AsyncOpenAI
+if OpenAI or AsyncOpenAI:
+    from deepeval.openai.patch import patch_openai_classes

-… (1 removed line; content not captured)
-    "openai",
-    "OpenAI",
-    "AsyncOpenAI",
-]
+    patch_openai_classes()