deepeval 3.6.4__py3-none-any.whl → 3.6.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +42 -10
- deepeval/_version.py +1 -1
- deepeval/config/logging.py +33 -0
- deepeval/config/settings.py +167 -12
- deepeval/dataset/dataset.py +8 -2
- deepeval/evaluate/evaluate.py +8 -2
- deepeval/evaluate/execute.py +28 -30
- deepeval/evaluate/types.py +4 -1
- deepeval/evaluate/utils.py +46 -29
- deepeval/integrations/crewai/__init__.py +1 -2
- deepeval/integrations/crewai/handler.py +153 -81
- deepeval/integrations/crewai/wrapper.py +87 -0
- deepeval/integrations/pydantic_ai/instrumentator.py +48 -9
- deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
- deepeval/metrics/faithfulness/faithfulness.py +8 -0
- deepeval/metrics/g_eval/g_eval.py +26 -15
- deepeval/metrics/prompt_alignment/prompt_alignment.py +41 -23
- deepeval/models/retry_policy.py +202 -11
- deepeval/test_run/__init__.py +2 -1
- deepeval/test_run/api.py +1 -0
- deepeval/test_run/test_run.py +85 -9
- deepeval/tracing/__init__.py +2 -0
- deepeval/tracing/otel/exporter.py +0 -6
- deepeval/tracing/otel/test_exporter.py +35 -0
- deepeval/tracing/otel/utils.py +57 -7
- deepeval/tracing/trace_context.py +14 -0
- deepeval/tracing/trace_test_manager.py +19 -0
- deepeval/tracing/tracing.py +7 -6
- deepeval/tracing/utils.py +2 -86
- deepeval/utils.py +149 -1
- {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/METADATA +1 -1
- {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/RECORD +35 -31
- deepeval/integrations/crewai/agent.py +0 -98
- deepeval/integrations/crewai/patch.py +0 -41
- {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/WHEEL +0 -0
- {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/g_eval/g_eval.py
CHANGED

@@ -1,5 +1,7 @@
 """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""
 
+import asyncio
+
 from typing import Optional, List, Tuple, Union, Type
 from deepeval.metrics import BaseMetric
 from deepeval.test_case import (
@@ -16,7 +18,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.g_eval
+from deepeval.metrics.g_eval import schema as gschema
 from deepeval.metrics.g_eval.utils import (
     Rubric,
     construct_g_eval_params_string,

@@ -29,6 +31,7 @@ from deepeval.metrics.g_eval.utils import (
     number_evaluation_steps,
     get_score_range,
 )
+from deepeval.config.settings import get_settings
 
 
 class GEval(BaseMetric):
@@ -81,12 +84,16 @@
     ):
         if self.async_mode:
             loop = get_or_create_event_loop()
+            coro = self.a_measure(
+                test_case,
+                _show_indicator=False,
+                _in_component=_in_component,
+                _additional_context=_additional_context,
+            )
             loop.run_until_complete(
-                self.a_measure(
-                    test_case,
-                    _show_indicator=False,
-                    _in_component=_in_component,
-                    _additional_context=_additional_context,
+                asyncio.wait_for(
+                    coro,
+                    timeout=get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
                 )
             )
         else:
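The synchronous measure() path now builds the coroutine once and bounds the whole task with asyncio.wait_for. A minimal, self-contained sketch of the pattern; the timeout constant and a_measure below are stand-ins for the DEEPEVAL_PER_TASK_TIMEOUT_SECONDS setting and the real metric coroutine:

import asyncio

PER_TASK_TIMEOUT_SECONDS = 2.0  # stand-in for get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS


async def a_measure() -> float:
    await asyncio.sleep(5)  # simulate a slow metric evaluation
    return 1.0


loop = asyncio.new_event_loop()
try:
    # Build the coroutine once, then bound the whole task so a hung
    # evaluation cannot block the synchronous caller forever.
    score = loop.run_until_complete(
        asyncio.wait_for(a_measure(), timeout=PER_TASK_TIMEOUT_SECONDS)
    )
except asyncio.TimeoutError:
    score = None  # surfaced to the caller as a timeout instead of a hang
finally:
    loop.close()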
@@ -177,7 +184,9 @@
             return data["steps"]
         else:
             try:
-                res: Steps = await self.model.a_generate(prompt, schema=Steps)
+                res: gschema.Steps = await self.model.a_generate(
+                    prompt, schema=gschema.Steps
+                )
                 return res.steps
             except TypeError:
                 res = await self.model.a_generate(prompt)

@@ -201,7 +210,9 @@
             return data["steps"]
         else:
             try:
-                res: Steps = self.model.generate(prompt, schema=Steps)
+                res: gschema.Steps = self.model.generate(
+                    prompt, schema=gschema.Steps
+                )
                 return res.steps
             except TypeError:
                 res = self.model.generate(prompt)
@@ -264,7 +275,7 @@
                     score, res
                 )
                 return weighted_summed_score, reason
-            except:
+            except (KeyError, AttributeError, TypeError, ValueError):
                 return score, reason
         except (
             AttributeError
@@ -276,8 +287,8 @@
             return data["score"], data["reason"]
         else:
             try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt, schema=ReasonScore
+                res: gschema.ReasonScore = await self.model.a_generate(
+                    prompt, schema=gschema.ReasonScore
                 )
                 return res.score, res.reason
             except TypeError:
@@ -338,7 +349,7 @@
                     score, res
                 )
                 return weighted_summed_score, reason
-            except:
+            except (KeyError, AttributeError, TypeError, ValueError):
                 return score, reason
         except AttributeError:
             # This catches the case where a_generate_raw_response doesn't exist.
@@ -349,8 +360,8 @@
             return data["score"], data["reason"]
         else:
             try:
-                res: ReasonScore = self.model.generate(
-                    prompt, schema=ReasonScore
+                res: gschema.ReasonScore = self.model.generate(
+                    prompt, schema=gschema.ReasonScore
                 )
                 return res.score, res.reason
             except TypeError:
@@ -364,7 +375,7 @@
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
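The bare except in is_successful is narrowed to TypeError, which is the error actually raised when the score is still None. A quick, runnable illustration of why that is the right exception:

score = None  # a metric that never produced a score
try:
    success = score >= 0.5  # comparing None to a float raises TypeError
except TypeError:
    success = False
print(success)  # False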
deepeval/metrics/prompt_alignment/prompt_alignment.py
CHANGED

@@ -1,3 +1,5 @@
+import asyncio
+
 from typing import Optional, List, Union
 
 from deepeval.utils import get_or_create_event_loop, prettify_list

@@ -15,7 +17,8 @@ from deepeval.metrics import BaseMetric
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.prompt_alignment.template import PromptAlignmentTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.prompt_alignment
+from deepeval.metrics.prompt_alignment import schema as paschema
+from deepeval.config.settings import get_settings
 
 
 class PromptAlignmentMetric(BaseMetric):
@@ -62,15 +65,19 @@
     ):
         if self.async_mode:
             loop = get_or_create_event_loop()
+            coro = self.a_measure(
+                test_case,
+                _show_indicator=False,
+                _in_component=_in_component,
+            )
             loop.run_until_complete(
-                self.a_measure(
-                    test_case,
-                    _show_indicator=False,
-                    _in_component=_in_component,
+                asyncio.wait_for(
+                    coro,
+                    timeout=get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
                 )
             )
         else:
-            self.verdicts: Verdicts = self._generate_verdicts(
+            self.verdicts: paschema.Verdicts = self._generate_verdicts(
                 test_case.input, test_case.actual_output
             )
             self.score = self._calculate_score()
@@ -105,7 +112,7 @@
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            self.verdicts: Verdicts = await self._a_generate_verdicts(
+            self.verdicts: paschema.Verdicts = await self._a_generate_verdicts(
                 test_case.input, test_case.actual_output
             )
             self.score = self._calculate_score()
@@ -141,14 +148,17 @@
         )
         if self.using_native_model:
             res, cost = await self.model.a_generate(
-                prompt, schema=PromptAlignmentScoreReason
+                prompt, schema=paschema.PromptAlignmentScoreReason
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: PromptAlignmentScoreReason = await self.model.a_generate(
-                    prompt=prompt, schema=PromptAlignmentScoreReason
+                res: paschema.PromptAlignmentScoreReason = (
+                    await self.model.a_generate(
+                        prompt=prompt,
+                        schema=paschema.PromptAlignmentScoreReason,
+                    )
                 )
                 return res.reason
             except TypeError:
@@ -173,14 +183,14 @@
         )
         if self.using_native_model:
             res, cost = self.model.generate(
-                prompt, schema=PromptAlignmentScoreReason
+                prompt, schema=paschema.PromptAlignmentScoreReason
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: PromptAlignmentScoreReason = self.model.generate(
-                    prompt=prompt, schema=PromptAlignmentScoreReason
+                res: paschema.PromptAlignmentScoreReason = self.model.generate(
+                    prompt=prompt, schema=paschema.PromptAlignmentScoreReason
                 )
                 return res.reason
             except TypeError:
@@ -190,48 +200,56 @@
 
     async def _a_generate_verdicts(
         self, input: str, actual_output: str
-    ) -> Verdicts:
+    ) -> paschema.Verdicts:
         prompt = PromptAlignmentTemplate.generate_verdicts(
             prompt_instructions=self.prompt_instructions,
             input=input,
             actual_output=actual_output,
         )
         if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
+            res, cost = await self.model.a_generate(
+                prompt, schema=paschema.Verdicts
+            )
             self.evaluation_cost += cost
             return [item for item in res.verdicts]
         else:
             try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
+                res: paschema.Verdicts = await self.model.a_generate(
+                    prompt, schema=paschema.Verdicts
                 )
                 return [item for item in res.verdicts]
             except TypeError:
                 res = await self.model.a_generate(prompt)
                 data = trimAndLoadJson(res, self)
                 return [
-                    PromptAlignmentVerdict(**item) for item in data["verdicts"]
+                    paschema.PromptAlignmentVerdict(**item)
+                    for item in data["verdicts"]
                 ]
 
-    def _generate_verdicts(self, input: str, actual_output: str) -> Verdicts:
+    def _generate_verdicts(
+        self, input: str, actual_output: str
+    ) -> paschema.Verdicts:
         prompt = PromptAlignmentTemplate.generate_verdicts(
             prompt_instructions=self.prompt_instructions,
             input=input,
             actual_output=actual_output,
         )
         if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
+            res, cost = self.model.generate(prompt, schema=paschema.Verdicts)
             self.evaluation_cost += cost
             return [item for item in res.verdicts]
         else:
             try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                res: paschema.Verdicts = self.model.generate(
+                    prompt, schema=paschema.Verdicts
+                )
                 return [item for item in res.verdicts]
             except TypeError:
                 res = self.model.generate(prompt)
                 data = trimAndLoadJson(res, self)
                 return [
-                    PromptAlignmentVerdict(**item) for item in data["verdicts"]
+                    paschema.PromptAlignmentVerdict(**item)
+                    for item in data["verdicts"]
                 ]
 
     def _calculate_score(self):
@@ -253,7 +271,7 @@
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
deepeval/models/retry_policy.py
CHANGED
@@ -33,9 +33,13 @@ Retry logging (settings; read at call time):
 
 from __future__ import annotations
 
+import asyncio
+import inspect
+import itertools
+import functools
+import threading
 import logging
 
-from deepeval.utils import read_env_int, read_env_float
 from dataclasses import dataclass, field
 from typing import Callable, Iterable, Mapping, Optional, Sequence, Tuple, Union
 from collections.abc import Mapping as ABCMapping

@@ -58,6 +62,9 @@ from deepeval.config.settings import get_settings
 
 logger = logging.getLogger(__name__)
 Provider = Union[str, PS]
+_MAX_TIMEOUT_THREADS = get_settings().DEEPEVAL_TIMEOUT_THREAD_LIMIT
+_TIMEOUT_SEMA = threading.BoundedSemaphore(_MAX_TIMEOUT_THREADS)
+_WORKER_ID = itertools.count(1)
 
 # --------------------------
 # Policy description
@@ -184,6 +191,12 @@ def extract_error_code(
 # Predicate factory
 # --------------------------
 
+_BUILTIN_TIMEOUT_EXCS = (
+    (TimeoutError,)
+    if asyncio.TimeoutError is TimeoutError
+    else (TimeoutError, asyncio.TimeoutError)
+)
+
 
 def make_is_transient(
     policy: ErrorPolicy,
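The conditional tuple exists because asyncio.TimeoutError became an alias of the builtin TimeoutError in Python 3.11; on older interpreters they are distinct classes and both must be matched. A small runnable check that works either way:

import asyncio

# One element on 3.11+ (where the alias holds), two elements before that.
timeout_excs = (
    (TimeoutError,)
    if asyncio.TimeoutError is TimeoutError
    else (TimeoutError, asyncio.TimeoutError)
)

print(isinstance(asyncio.TimeoutError(), timeout_excs))  # True on any version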
@@ -213,6 +226,9 @@
     )
 
     def _pred(e: Exception) -> bool:
+        if isinstance(e, _BUILTIN_TIMEOUT_EXCS):
+            return True
+
         if isinstance(e, policy.auth_excs):
             return False
 
@@ -245,18 +261,23 @@
 
 class StopFromEnv(stop_base):
     def __call__(self, retry_state):
-
+        settings = get_settings()
+        attempts = (
+            settings.DEEPEVAL_RETRY_MAX_ATTEMPTS
+        )  # TODO: add constraints in settings
         return stop_after_attempt(attempts)(retry_state)
 
 
 class WaitFromEnv(wait_base):
     def __call__(self, retry_state):
-
-
-
-
-
-
+        settings = get_settings()
+        initial = settings.DEEPEVAL_RETRY_INITIAL_SECONDS
+        exp_base = settings.DEEPEVAL_RETRY_EXP_BASE
+        jitter = settings.DEEPEVAL_RETRY_JITTER
+        cap = settings.DEEPEVAL_RETRY_CAP_SECONDS
+
+        if cap == 0:  # <- 0 means no backoff sleeps or jitter
+            return 0
         return wait_exponential_jitter(
             initial=initial, exp_base=exp_base, jitter=jitter, max=cap
         )(retry_state)
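WaitFromEnv re-reads the settings on every call, so the backoff can be re-tuned without rebuilding the decorator, and a cap of 0 short-circuits to no sleep at all. A sketch of the equivalent static Tenacity wiring, with hypothetical values standing in for the DEEPEVAL_RETRY_* settings:

from tenacity import retry, stop_after_attempt, wait_exponential_jitter

# Hypothetical stand-ins for DEEPEVAL_RETRY_INITIAL_SECONDS, _EXP_BASE,
# _JITTER, and _CAP_SECONDS; in the diff, cap == 0 means "never sleep".
INITIAL, EXP_BASE, JITTER, CAP = 1.0, 2.0, 0.5, 10.0


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential_jitter(
        initial=INITIAL, exp_base=EXP_BASE, jitter=JITTER, max=CAP
    ),
)
def flaky_call() -> str:
    raise ConnectionError("transient upstream error")  # retried with backoff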
@@ -324,10 +345,11 @@ def dynamic_retry(provider: Provider):
 
 def _retry_log_levels():
     s = get_settings()
+    base_level = s.LOG_LEVEL if s.LOG_LEVEL is not None else logging.INFO
     before_level = s.DEEPEVAL_RETRY_BEFORE_LOG_LEVEL
     after_level = s.DEEPEVAL_RETRY_AFTER_LOG_LEVEL
     return (
-        before_level if before_level is not None else logging.INFO,
+        before_level if before_level is not None else base_level,
         after_level if after_level is not None else logging.ERROR,
     )
 
@@ -394,21 +416,190 @@ def make_after_log(slug: str):
     return _after
 
 
+def _make_timeout_error(timeout_seconds: float) -> TimeoutError:
+    settings = get_settings()
+    if logger.isEnabledFor(logging.DEBUG):
+        logger.debug(
+            "retry config: per_attempt=%s s, max_attempts=%s, per_task_budget=%s s",
+            timeout_seconds,
+            settings.DEEPEVAL_RETRY_MAX_ATTEMPTS,
+            settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+        )
+    msg = (
+        f"call timed out after {timeout_seconds:g}s (per attempt). "
+        "Increase DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS (0 disables) or reduce work per attempt."
+    )
+    return TimeoutError(msg)
+
+
+def _run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
+    """
+    Run a synchronous callable with a soft timeout enforced by a helper thread,
+    with a global cap on concurrent timeout workers.
+
+    How it works
+    ------------
+    - A module-level BoundedSemaphore (size = settings.DEEPEVAL_TIMEOUT_THREAD_LIMIT)
+      gates creation of timeout worker threads. If no permit is available, this call
+      blocks until a slot frees up. If settings.DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS
+      > 0 and acquisition takes longer than that, a warning is logged before continuing
+      to wait.
+    - Once a permit is acquired, a daemon thread executes `func(*args, **kwargs)`.
+    - We wait up to `timeout_seconds` for completion. If the timeout elapses, we raise
+      `TimeoutError`. The worker thread is not killed; it continues and releases the
+      semaphore when it eventually finishes.
+    - If the worker finishes in time, we return its result or re-raise its exception
+      (with the original traceback).
+
+    Cancellation semantics
+    ----------------------
+    This is a soft timeout: Python threads cannot be forcibly terminated. When timeouts
+    are rare this is fine. If timeouts are common, consider moving to:
+      - a shared ThreadPoolExecutor (caps threads and amortizes creation), or
+      - a worker process (supports killing in-flight processes).
+
+    Concurrency control & logging
+    -----------------------------
+    - Concurrency is bounded by `DEEPEVAL_TIMEOUT_THREAD_LIMIT`.
+    - If acquisition exceeds `DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS`, we log a
+      warning and then block until a slot is available.
+    - On timeout, if DEBUG is enabled and `DEEPEVAL_VERBOSE_MODE` is True, we log a short
+      thread sample to help diagnose pressure.
+
+    Args:
+        func: Synchronous callable to execute.
+        timeout_seconds: Float seconds for the soft timeout (0/None disables).
+        *args, **kwargs: Passed through to `func`.
+
+    Returns:
+        Whatever `func` returns.
+
+    Raises:
+        TimeoutError: If `timeout_seconds` elapse before completion.
+        BaseException: If `func` raises, the same exception is re-raised with its
+            original traceback.
+    """
+    if not timeout_seconds or timeout_seconds <= 0:
+        return func(*args, **kwargs)
+
+    # try to respect the global cap on concurrent timeout workers
+    warn_after = float(
+        get_settings().DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS or 0.0
+    )
+    if warn_after > 0:
+        acquired = _TIMEOUT_SEMA.acquire(timeout=warn_after)
+        if not acquired:
+            logger.warning(
+                "timeout thread limit reached (%d); waiting for a slot...",
+                _MAX_TIMEOUT_THREADS,
+            )
+            _TIMEOUT_SEMA.acquire()
+    else:
+        _TIMEOUT_SEMA.acquire()
+
+    done = threading.Event()
+    result = {"value": None, "exc": None}
+
+    def target():
+        try:
+            result["value"] = func(*args, **kwargs)
+        except BaseException as e:
+            result["exc"] = e
+        finally:
+            done.set()
+            _TIMEOUT_SEMA.release()
+
+    t = threading.Thread(
+        target=target,
+        daemon=True,
+        name=f"deepeval-timeout-worker-{next(_WORKER_ID)}",
+    )
+
+    try:
+        t.start()
+    except BaseException:
+        _TIMEOUT_SEMA.release()
+        raise
+
+    finished = done.wait(timeout_seconds)
+    if not finished:
+        if (
+            logger.isEnabledFor(logging.DEBUG)
+            and get_settings().DEEPEVAL_VERBOSE_MODE
+        ):
+            names = [th.name for th in threading.enumerate()[:10]]
+            logger.debug(
+                "timeout after %.3fs (active_threads=%d, sample=%s)",
+                timeout_seconds,
+                threading.active_count(),
+                names,
+            )
+        raise _make_timeout_error(timeout_seconds)
+
+    # Completed within time: return or raise
+    if result["exc"] is not None:
+        exc = result["exc"]
+        raise exc.with_traceback(getattr(exc, "__traceback__", None))
+    return result["value"]
+
+
 def create_retry_decorator(provider: Provider):
     """
     Build a Tenacity @retry decorator wired to our dynamic retry policy
     for the given provider slug.
     """
     slug = slugify(provider)
-
-    return retry(
+    base_retry = retry(
         wait=dynamic_wait(),
         stop=dynamic_stop(),
         retry=dynamic_retry(slug),
         before_sleep=make_before_sleep_log(slug),
         after=make_after_log(slug),
+        reraise=False,
     )
 
+    def _decorator(func):
+        if inspect.iscoroutinefunction(func):
+
+            @functools.wraps(func)
+            async def attempt(*args, **kwargs):
+                timeout_seconds = (
+                    get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
+                )
+                coro = func(*args, **kwargs)
+                if timeout_seconds > 0:
+                    try:
+                        return await asyncio.wait_for(coro, timeout_seconds)
+                    except asyncio.TimeoutError as e:
+                        if (
+                            logger.isEnabledFor(logging.DEBUG)
+                            and get_settings().DEEPEVAL_VERBOSE_MODE is True
+                        ):
+                            logger.debug(
+                                "async timeout after %.3fs (active_threads=%d, tasks=%d)",
+                                timeout_seconds,
+                                threading.active_count(),
+                                len(asyncio.all_tasks()),
+                            )
+                        raise _make_timeout_error(timeout_seconds) from e
+                return await coro
+
+            return base_retry(attempt)
+
+        @functools.wraps(func)
+        def attempt(*args, **kwargs):
+            timeout_seconds = (
+                get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
+            )
+            if timeout_seconds > 0:
+                return _run_sync_with_timeout(
+                    func, timeout_seconds, *args, **kwargs
+                )
+            return func(*args, **kwargs)
+
+        return base_retry(attempt)
+
+    return _decorator
+
 
 def _httpx_net_excs() -> tuple[type, ...]:
     try:
deepeval/test_run/__init__.py
CHANGED
@@ -11,7 +11,7 @@ from .test_run import (
 )
 
 from .hooks import on_test_run_end, invoke_test_run_end_hook
-from .api import MetricData
+from .api import MetricData, TurnApi
 from .hyperparameters import log_hyperparameters
 
 

@@ -28,5 +28,6 @@ __all__ = [
     "on_test_run_end",
     "invoke_test_run_end_hook",
     "MetricData",
+    "TurnApi",
     "log_hyperparameters",
 ]
deepeval/test_run/api.py
CHANGED
@@ -99,6 +99,7 @@ class TurnApi(BaseModel):
     role: str
     content: str
     order: int
+    user_id: Optional[str] = Field(None, alias="userId")
     retrieval_context: Optional[list] = Field(None, alias="retrievalContext")
     tools_called: Optional[List[ToolCall]] = Field(None, alias="toolsCalled")
     additional_metadata: Optional[Dict] = Field(