deepeval 3.6.4__py3-none-any.whl → 3.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. deepeval/__init__.py +42 -10
  2. deepeval/_version.py +1 -1
  3. deepeval/config/logging.py +33 -0
  4. deepeval/config/settings.py +167 -12
  5. deepeval/dataset/dataset.py +8 -2
  6. deepeval/evaluate/evaluate.py +8 -2
  7. deepeval/evaluate/execute.py +28 -30
  8. deepeval/evaluate/types.py +4 -1
  9. deepeval/evaluate/utils.py +46 -29
  10. deepeval/integrations/crewai/__init__.py +1 -2
  11. deepeval/integrations/crewai/handler.py +153 -81
  12. deepeval/integrations/crewai/wrapper.py +87 -0
  13. deepeval/integrations/pydantic_ai/instrumentator.py +48 -9
  14. deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
  15. deepeval/metrics/faithfulness/faithfulness.py +8 -0
  16. deepeval/metrics/g_eval/g_eval.py +26 -15
  17. deepeval/metrics/prompt_alignment/prompt_alignment.py +41 -23
  18. deepeval/models/retry_policy.py +202 -11
  19. deepeval/test_run/__init__.py +2 -1
  20. deepeval/test_run/api.py +1 -0
  21. deepeval/test_run/test_run.py +85 -9
  22. deepeval/tracing/__init__.py +2 -0
  23. deepeval/tracing/otel/exporter.py +0 -6
  24. deepeval/tracing/otel/test_exporter.py +35 -0
  25. deepeval/tracing/otel/utils.py +57 -7
  26. deepeval/tracing/trace_context.py +14 -0
  27. deepeval/tracing/trace_test_manager.py +19 -0
  28. deepeval/tracing/tracing.py +7 -6
  29. deepeval/tracing/utils.py +2 -86
  30. deepeval/utils.py +149 -1
  31. {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/METADATA +1 -1
  32. {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/RECORD +35 -31
  33. deepeval/integrations/crewai/agent.py +0 -98
  34. deepeval/integrations/crewai/patch.py +0 -41
  35. {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/LICENSE.md +0 -0
  36. {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/WHEEL +0 -0
  37. {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/g_eval/g_eval.py CHANGED
@@ -1,5 +1,7 @@
  """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""

+ import asyncio
+
  from typing import Optional, List, Tuple, Union, Type
  from deepeval.metrics import BaseMetric
  from deepeval.test_case import (
@@ -16,7 +18,7 @@ from deepeval.metrics.utils import (
  )
  from deepeval.models import DeepEvalBaseLLM
  from deepeval.metrics.indicator import metric_progress_indicator
- from deepeval.metrics.g_eval.schema import *
+ from deepeval.metrics.g_eval import schema as gschema
  from deepeval.metrics.g_eval.utils import (
  Rubric,
  construct_g_eval_params_string,
@@ -29,6 +31,7 @@ from deepeval.metrics.g_eval.utils import (
  number_evaluation_steps,
  get_score_range,
  )
+ from deepeval.config.settings import get_settings


  class GEval(BaseMetric):
@@ -81,12 +84,16 @@ class GEval(BaseMetric):
  ):
  if self.async_mode:
  loop = get_or_create_event_loop()
+ coro = self.a_measure(
+ test_case,
+ _show_indicator=False,
+ _in_component=_in_component,
+ _additional_context=_additional_context,
+ )
  loop.run_until_complete(
- self.a_measure(
- test_case,
- _show_indicator=False,
- _in_component=_in_component,
- _additional_context=_additional_context,
+ asyncio.wait_for(
+ coro,
+ timeout=get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
  )
  )
  else:
@@ -177,7 +184,9 @@ class GEval(BaseMetric):
  return data["steps"]
  else:
  try:
- res: Steps = await self.model.a_generate(prompt, schema=Steps)
+ res: gschema.Steps = await self.model.a_generate(
+ prompt, schema=gschema.Steps
+ )
  return res.steps
  except TypeError:
  res = await self.model.a_generate(prompt)
@@ -201,7 +210,9 @@ class GEval(BaseMetric):
  return data["steps"]
  else:
  try:
- res: Steps = self.model.generate(prompt, schema=Steps)
+ res: gschema.Steps = self.model.generate(
+ prompt, schema=gschema.Steps
+ )
  return res.steps
  except TypeError:
  res = self.model.generate(prompt)
@@ -264,7 +275,7 @@
  score, res
  )
  return weighted_summed_score, reason
- except:
+ except (KeyError, AttributeError, TypeError, ValueError):
  return score, reason
  except (
  AttributeError
@@ -276,8 +287,8 @@ class GEval(BaseMetric):
  return data["score"], data["reason"]
  else:
  try:
- res: ReasonScore = await self.model.a_generate(
- prompt, schema=ReasonScore
+ res: gschema.ReasonScore = await self.model.a_generate(
+ prompt, schema=gschema.ReasonScore
  )
  return res.score, res.reason
  except TypeError:
@@ -338,7 +349,7 @@ class GEval(BaseMetric):
  score, res
  )
  return weighted_summed_score, reason
- except:
+ except (KeyError, AttributeError, TypeError, ValueError):
  return score, reason
  except AttributeError:
  # This catches the case where a_generate_raw_response doesn't exist.
@@ -349,8 +360,8 @@ class GEval(BaseMetric):
  return data["score"], data["reason"]
  else:
  try:
- res: ReasonScore = self.model.generate(
- prompt, schema=ReasonScore
+ res: gschema.ReasonScore = self.model.generate(
+ prompt, schema=gschema.ReasonScore
  )
  return res.score, res.reason
  except TypeError:
@@ -364,7 +375,7 @@ class GEval(BaseMetric):
  else:
  try:
  self.success = self.score >= self.threshold
- except:
+ except TypeError:
  self.success = False
  return self.success

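The measure() change above builds the a_measure(...) coroutine first and then bounds the whole run with asyncio.wait_for, using the new DEEPEVAL_PER_TASK_TIMEOUT_SECONDS setting as the budget. A minimal, self-contained sketch of that pattern; the dummy coroutine, the explicit event-loop handling, and the 30-second default below are illustrative assumptions, not deepeval's actual code:

import asyncio


async def a_measure(test_case: str) -> float:
    # stand-in for the metric's real LLM calls
    await asyncio.sleep(0.1)
    return 1.0


def measure(test_case: str, per_task_timeout_seconds: float = 30.0) -> float:
    # build the coroutine first, then cap the entire task with one timeout
    loop = asyncio.new_event_loop()
    try:
        coro = a_measure(test_case)
        return loop.run_until_complete(
            asyncio.wait_for(coro, timeout=per_task_timeout_seconds)
        )
    finally:
        loop.close()


print(measure("example input"))  # finishes well inside the budget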
deepeval/metrics/prompt_alignment/prompt_alignment.py CHANGED
@@ -1,3 +1,5 @@
+ import asyncio
+
  from typing import Optional, List, Union

  from deepeval.utils import get_or_create_event_loop, prettify_list
@@ -15,7 +17,8 @@ from deepeval.metrics import BaseMetric
  from deepeval.models import DeepEvalBaseLLM
  from deepeval.metrics.prompt_alignment.template import PromptAlignmentTemplate
  from deepeval.metrics.indicator import metric_progress_indicator
- from deepeval.metrics.prompt_alignment.schema import *
+ from deepeval.metrics.prompt_alignment import schema as paschema
+ from deepeval.config.settings import get_settings


  class PromptAlignmentMetric(BaseMetric):
@@ -62,15 +65,19 @@ class PromptAlignmentMetric(BaseMetric):
  ):
  if self.async_mode:
  loop = get_or_create_event_loop()
+ coro = self.a_measure(
+ test_case,
+ _show_indicator=False,
+ _in_component=_in_component,
+ )
  loop.run_until_complete(
- self.a_measure(
- test_case,
- _show_indicator=False,
- _in_component=_in_component,
+ asyncio.wait_for(
+ coro,
+ timeout=get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
  )
  )
  else:
- self.verdicts: Verdicts = self._generate_verdicts(
+ self.verdicts: paschema.Verdicts = self._generate_verdicts(
  test_case.input, test_case.actual_output
  )
  self.score = self._calculate_score()
@@ -105,7 +112,7 @@ class PromptAlignmentMetric(BaseMetric):
  _show_indicator=_show_indicator,
  _in_component=_in_component,
  ):
- self.verdicts: Verdicts = await self._a_generate_verdicts(
+ self.verdicts: paschema.Verdicts = await self._a_generate_verdicts(
  test_case.input, test_case.actual_output
  )
  self.score = self._calculate_score()
@@ -141,14 +148,17 @@ class PromptAlignmentMetric(BaseMetric):
  )
  if self.using_native_model:
  res, cost = await self.model.a_generate(
- prompt, schema=PromptAlignmentScoreReason
+ prompt, schema=paschema.PromptAlignmentScoreReason
  )
  self.evaluation_cost += cost
  return res.reason
  else:
  try:
- res: PromptAlignmentScoreReason = await self.model.a_generate(
- prompt=prompt, schema=PromptAlignmentScoreReason
+ res: paschema.PromptAlignmentScoreReason = (
+ await self.model.a_generate(
+ prompt=prompt,
+ schema=paschema.PromptAlignmentScoreReason,
+ )
  )
  return res.reason
  except TypeError:
@@ -173,14 +183,14 @@ class PromptAlignmentMetric(BaseMetric):
  )
  if self.using_native_model:
  res, cost = self.model.generate(
- prompt, schema=PromptAlignmentScoreReason
+ prompt, schema=paschema.PromptAlignmentScoreReason
  )
  self.evaluation_cost += cost
  return res.reason
  else:
  try:
- res: PromptAlignmentScoreReason = self.model.generate(
- prompt=prompt, schema=PromptAlignmentScoreReason
+ res: paschema.PromptAlignmentScoreReason = self.model.generate(
+ prompt=prompt, schema=paschema.PromptAlignmentScoreReason
  )
  return res.reason
  except TypeError:
@@ -190,48 +200,56 @@ class PromptAlignmentMetric(BaseMetric):

  async def _a_generate_verdicts(
  self, input: str, actual_output: str
- ) -> Verdicts:
+ ) -> paschema.Verdicts:
  prompt = PromptAlignmentTemplate.generate_verdicts(
  prompt_instructions=self.prompt_instructions,
  input=input,
  actual_output=actual_output,
  )
  if self.using_native_model:
- res, cost = await self.model.a_generate(prompt, schema=Verdicts)
+ res, cost = await self.model.a_generate(
+ prompt, schema=paschema.Verdicts
+ )
  self.evaluation_cost += cost
  return [item for item in res.verdicts]
  else:
  try:
- res: Verdicts = await self.model.a_generate(
- prompt, schema=Verdicts
+ res: paschema.Verdicts = await self.model.a_generate(
+ prompt, schema=paschema.Verdicts
  )
  return [item for item in res.verdicts]
  except TypeError:
  res = await self.model.a_generate(prompt)
  data = trimAndLoadJson(res, self)
  return [
- PromptAlignmentVerdict(**item) for item in data["verdicts"]
+ paschema.PromptAlignmentVerdict(**item)
+ for item in data["verdicts"]
  ]

- def _generate_verdicts(self, input: str, actual_output: str) -> Verdicts:
+ def _generate_verdicts(
+ self, input: str, actual_output: str
+ ) -> paschema.Verdicts:
  prompt = PromptAlignmentTemplate.generate_verdicts(
  prompt_instructions=self.prompt_instructions,
  input=input,
  actual_output=actual_output,
  )
  if self.using_native_model:
- res, cost = self.model.generate(prompt, schema=Verdicts)
+ res, cost = self.model.generate(prompt, schema=paschema.Verdicts)
  self.evaluation_cost += cost
  return [item for item in res.verdicts]
  else:
  try:
- res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+ res: paschema.Verdicts = self.model.generate(
+ prompt, schema=paschema.Verdicts
+ )
  return [item for item in res.verdicts]
  except TypeError:
  res = self.model.generate(prompt)
  data = trimAndLoadJson(res, self)
  return [
- PromptAlignmentVerdict(**item) for item in data["verdicts"]
+ paschema.PromptAlignmentVerdict(**item)
+ for item in data["verdicts"]
  ]

  def _calculate_score(self):
@@ -253,7 +271,7 @@ class PromptAlignmentMetric(BaseMetric):
  else:
  try:
  self.success = self.score >= self.threshold
- except:
+ except TypeError:
  self.success = False
  return self.success

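Both metrics also narrow the bare except around the success check to except TypeError. Presumably this covers the case where self.score is still None (no score was produced), since comparing None against a float threshold raises TypeError, while other failures now propagate instead of being silently swallowed. A small illustration of that comparison behavior:

threshold = 0.5

for score in (0.7, None):
    try:
        success = score >= threshold  # None >= float raises TypeError
    except TypeError:
        success = False
    print(f"score={score!r} -> success={success}")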
deepeval/models/retry_policy.py CHANGED
@@ -33,9 +33,13 @@ Retry logging (settings; read at call time):

  from __future__ import annotations

+ import asyncio
+ import inspect
+ import itertools
+ import functools
+ import threading
  import logging

- from deepeval.utils import read_env_int, read_env_float
  from dataclasses import dataclass, field
  from typing import Callable, Iterable, Mapping, Optional, Sequence, Tuple, Union
  from collections.abc import Mapping as ABCMapping
@@ -58,6 +62,9 @@ from deepeval.config.settings import get_settings

  logger = logging.getLogger(__name__)
  Provider = Union[str, PS]
+ _MAX_TIMEOUT_THREADS = get_settings().DEEPEVAL_TIMEOUT_THREAD_LIMIT
+ _TIMEOUT_SEMA = threading.BoundedSemaphore(_MAX_TIMEOUT_THREADS)
+ _WORKER_ID = itertools.count(1)

  # --------------------------
  # Policy description
@@ -184,6 +191,12 @@ def extract_error_code(
  # Predicate factory
  # --------------------------

+ _BUILTIN_TIMEOUT_EXCS = (
+ (TimeoutError,)
+ if asyncio.TimeoutError is TimeoutError
+ else (TimeoutError, asyncio.TimeoutError)
+ )
+

  def make_is_transient(
  policy: ErrorPolicy,
@@ -213,6 +226,9 @@ def make_is_transient(
  )

  def _pred(e: Exception) -> bool:
+ if isinstance(e, _BUILTIN_TIMEOUT_EXCS):
+ return True
+
  if isinstance(e, policy.auth_excs):
  return False

@@ -245,18 +261,23 @@

  class StopFromEnv(stop_base):
  def __call__(self, retry_state):
- attempts = read_env_int("DEEPEVAL_RETRY_MAX_ATTEMPTS", 2, min_value=1)
+ settings = get_settings()
+ attempts = (
+ settings.DEEPEVAL_RETRY_MAX_ATTEMPTS
+ ) # TODO: add constraints in settings
  return stop_after_attempt(attempts)(retry_state)


  class WaitFromEnv(wait_base):
  def __call__(self, retry_state):
- initial = read_env_float(
- "DEEPEVAL_RETRY_INITIAL_SECONDS", 1.0, min_value=0.0
- )
- exp_base = read_env_float("DEEPEVAL_RETRY_EXP_BASE", 2.0, min_value=1.0)
- jitter = read_env_float("DEEPEVAL_RETRY_JITTER", 2.0, min_value=0.0)
- cap = read_env_float("DEEPEVAL_RETRY_CAP_SECONDS", 5.0, min_value=0.0)
+ settings = get_settings()
+ initial = settings.DEEPEVAL_RETRY_INITIAL_SECONDS
+ exp_base = settings.DEEPEVAL_RETRY_EXP_BASE
+ jitter = settings.DEEPEVAL_RETRY_JITTER
+ cap = settings.DEEPEVAL_RETRY_CAP_SECONDS
+
+ if cap == 0: # <- 0 means no backoff sleeps or jitter
+ return 0
  return wait_exponential_jitter(
  initial=initial, exp_base=exp_base, jitter=jitter, max=cap
  )(retry_state)
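StopFromEnv and WaitFromEnv now read their numbers from get_settings() on every call instead of re-parsing environment variables, and a cap of 0 turns backoff sleeps off entirely. A rough sketch of that read-settings-at-call-time shape with Tenacity; the RetryConfig dataclass and get_retry_config() accessor are hypothetical stand-ins for deepeval's settings object:

from dataclasses import dataclass

from tenacity import retry, stop_after_attempt, wait_exponential_jitter
from tenacity.wait import wait_base


@dataclass
class RetryConfig:
    initial: float = 1.0
    exp_base: float = 2.0
    jitter: float = 2.0
    cap: float = 5.0


def get_retry_config() -> RetryConfig:
    # hypothetical accessor; deepeval reads these from get_settings()
    return RetryConfig()


class WaitFromConfig(wait_base):
    def __call__(self, retry_state):
        cfg = get_retry_config()  # re-read on every retry, so changes apply live
        if cfg.cap == 0:  # 0 disables backoff sleeps entirely
            return 0
        return wait_exponential_jitter(
            initial=cfg.initial, exp_base=cfg.exp_base, jitter=cfg.jitter, max=cfg.cap
        )(retry_state)


@retry(wait=WaitFromConfig(), stop=stop_after_attempt(3))
def flaky_call() -> str:
    # each retry re-evaluates the wait strategy against current config
    raise RuntimeError("transient failure")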
@@ -324,10 +345,11 @@ def dynamic_retry(provider: Provider):

  def _retry_log_levels():
  s = get_settings()
+ base_level = s.LOG_LEVEL if s.LOG_LEVEL is not None else logging.INFO
  before_level = s.DEEPEVAL_RETRY_BEFORE_LOG_LEVEL
  after_level = s.DEEPEVAL_RETRY_AFTER_LOG_LEVEL
  return (
- before_level if before_level is not None else logging.INFO,
+ before_level if before_level is not None else base_level,
  after_level if after_level is not None else logging.ERROR,
  )

@@ -394,21 +416,190 @@ def make_after_log(slug: str):
  return _after


+ def _make_timeout_error(timeout_seconds: float) -> TimeoutError:
+ settings = get_settings()
+ if logger.isEnabledFor(logging.DEBUG):
+ logger.debug(
+ "retry config: per_attempt=%s s, max_attempts=%s, per_task_budget=%s s",
+ timeout_seconds,
+ settings.DEEPEVAL_RETRY_MAX_ATTEMPTS,
+ settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+ )
+ msg = (
+ f"call timed out after {timeout_seconds:g}s (per attempt). "
+ "Increase DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS (0 disables) or reduce work per attempt."
+ )
+ return TimeoutError(msg)
+
+
+ def _run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
+ """
+ Run a synchronous callable with a soft timeout enforced by a helper thread,
+ with a global cap on concurrent timeout-workers.
+
+ How it works
+ ------------
+ - A module-level BoundedSemaphore (size = settings.DEEPEVAL_TIMEOUT_THREAD_LIMIT)
+ gates creation of timeout worker threads. If no permit is available, this call
+ blocks until a slot frees up. If settings.DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS
+ > 0 and acquisition takes longer than that, a warning is logged before continuing
+ to wait.
+ - Once a permit is acquired, a daemon thread executes `func(*args, **kwargs)`.
+ - We wait up to `timeout_seconds` for completion. If the timeout elapses, we raise
+ `TimeoutError`. The worker thread is not killed, it continues and releases the semaphore when it eventually finishes.
+ - If the worker finishes in time, we return its result or re-raise its exception
+ (with original traceback).
+
+ Cancellation semantics
+ ----------------------
+ This is a soft timeout: Python threads cannot be forcibly terminated. When timeouts
+ are rare this is fine. If timeouts are common, consider moving to:
+ - a shared ThreadPoolExecutor (caps threads and amortizes creation), or
+ - worker process (supports killing in-flight processes)
+
+ Concurrency control & logging
+ -----------------------------
+ - Concurrency is bounded by `DEEPEVAL_TIMEOUT_THREAD_LIMIT`.
+ - If acquisition exceeds `DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS`, we log a
+ warning and then block until a slot is available.
+ - On timeout, if DEBUG is enabled and `DEEPEVAL_VERBOSE_MODE` is True, we log a short
+ thread sample to help diagnose pressure.
+
+ Args:
+ func: Synchronous callable to execute.
+ timeout_seconds: Float seconds for the soft timeout (0/None disables).
+ *args, **kwargs: Passed through to `func`.
+
+ Returns:
+ Whatever `func` returns.
+
+ Raises:
+ TimeoutError: If `timeout_seconds` elapse before completion.
+ BaseException: If `func` raises, the same exception is re-raised with its
+ original traceback.
+ """
+ if not timeout_seconds or timeout_seconds <= 0:
+ return func(*args, **kwargs)
+
+ # try to respect the global cap on concurrent timeout workers
+ warn_after = float(
+ get_settings().DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS or 0.0
+ )
+ if warn_after > 0:
+ acquired = _TIMEOUT_SEMA.acquire(timeout=warn_after)
+ if not acquired:
+ logger.warning(
+ "timeout thread limit reached (%d); waiting for a slot...",
+ _MAX_TIMEOUT_THREADS,
+ )
+ _TIMEOUT_SEMA.acquire()
+ else:
+ _TIMEOUT_SEMA.acquire()
+
+ done = threading.Event()
+ result = {"value": None, "exc": None}
+
+ def target():
+ try:
+ result["value"] = func(*args, **kwargs)
+ except BaseException as e:
+ result["exc"] = e
+ finally:
+ done.set()
+ _TIMEOUT_SEMA.release()
+
+ t = threading.Thread(
+ target=target,
+ daemon=True,
+ name=f"deepeval-timeout-worker-{next(_WORKER_ID)}",
+ )
+
+ try:
+ t.start()
+ except BaseException:
+ _TIMEOUT_SEMA.release()
+ raise
+
+ finished = done.wait(timeout_seconds)
+ if not finished:
+ if (
+ logger.isEnabledFor(logging.DEBUG)
+ and get_settings().DEEPEVAL_VERBOSE_MODE
+ ):
+ names = [th.name for th in threading.enumerate()[:10]]
+ logger.debug(
+ "timeout after %.3fs (active_threads=%d, sample=%s)",
+ timeout_seconds,
+ threading.active_count(),
+ names,
+ )
+ raise _make_timeout_error(timeout_seconds)
+
+ # Completed within time: return or raise
+ if result["exc"] is not None:
+ exc = result["exc"]
+ raise exc.with_traceback(getattr(exc, "__traceback__", None))
+ return result["value"]
+
+
  def create_retry_decorator(provider: Provider):
  """
  Build a Tenacity @retry decorator wired to our dynamic retry policy
  for the given provider slug.
  """
  slug = slugify(provider)
-
- return retry(
+ base_retry = retry(
  wait=dynamic_wait(),
  stop=dynamic_stop(),
  retry=dynamic_retry(slug),
  before_sleep=make_before_sleep_log(slug),
  after=make_after_log(slug),
+ reraise=False,
  )

+ def _decorator(func):
+ if inspect.iscoroutinefunction(func):
+
+ @functools.wraps(func)
+ async def attempt(*args, **kwargs):
+ timeout_seconds = (
+ get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
+ )
+ coro = func(*args, **kwargs)
+ if timeout_seconds > 0:
+ try:
+ return await asyncio.wait_for(coro, timeout_seconds)
+ except asyncio.TimeoutError as e:
+ if (
+ logger.isEnabledFor(logging.DEBUG)
+ and get_settings().DEEPEVAL_VERBOSE_MODE is True
+ ):
+ logger.debug(
+ "async timeout after %.3fs (active_threads=%d, tasks=%d)",
+ timeout_seconds,
+ threading.active_count(),
+ len(asyncio.all_tasks()),
+ )
+ raise _make_timeout_error(timeout_seconds) from e
+ return await coro
+
+ return base_retry(attempt)
+
+ @functools.wraps(func)
+ def attempt(*args, **kwargs):
+ timeout_seconds = (
+ get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
+ )
+ if timeout_seconds > 0:
+ return _run_sync_with_timeout(
+ func, timeout_seconds, *args, **kwargs
+ )
+ return func(*args, **kwargs)
+
+ return base_retry(attempt)
+
+ return _decorator
+

  def _httpx_net_excs() -> tuple[type, ...]:
  try:
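The new _make_timeout_error, _run_sync_with_timeout, and the per-attempt wrappers inside create_retry_decorator give every retry attempt a soft deadline: async attempts are bounded with asyncio.wait_for, while sync attempts run on daemon helper threads capped by a semaphore. A stripped-down, self-contained sketch of the sync side only, without the semaphore cap, settings lookups, and logging of the real implementation:

import threading
import time


def run_with_soft_timeout(func, timeout_seconds, *args, **kwargs):
    done = threading.Event()
    result = {"value": None, "exc": None}

    def target():
        try:
            result["value"] = func(*args, **kwargs)
        except BaseException as e:  # captured so the caller sees the failure
            result["exc"] = e
        finally:
            done.set()

    worker = threading.Thread(target=target, daemon=True)
    worker.start()

    if not done.wait(timeout_seconds):
        # soft timeout: the worker keeps running in the background
        raise TimeoutError(f"call timed out after {timeout_seconds:g}s")
    if result["exc"] is not None:
        raise result["exc"]
    return result["value"]


print(run_with_soft_timeout(lambda: "ok", 1.0))
try:
    run_with_soft_timeout(time.sleep, 0.05, 5)  # sleep(5) under a 0.05s budget
except TimeoutError as exc:
    print(exc)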
deepeval/test_run/__init__.py CHANGED
@@ -11,7 +11,7 @@ from .test_run import (
  )

  from .hooks import on_test_run_end, invoke_test_run_end_hook
- from .api import MetricData
+ from .api import MetricData, TurnApi
  from .hyperparameters import log_hyperparameters


@@ -28,5 +28,6 @@ __all__ = [
  "on_test_run_end",
  "invoke_test_run_end_hook",
  "MetricData",
+ "TurnApi",
  "log_hyperparameters",
  ]
deepeval/test_run/api.py CHANGED
@@ -99,6 +99,7 @@ class TurnApi(BaseModel):
  role: str
  content: str
  order: int
+ user_id: Optional[str] = Field(None, alias="userId")
  retrieval_context: Optional[list] = Field(None, alias="retrievalContext")
  tools_called: Optional[List[ToolCall]] = Field(None, alias="toolsCalled")
  additional_metadata: Optional[Dict] = Field(
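The TurnApi hunk above (shown truncated) adds an optional user_id field serialized under the userId alias. A hypothetical, simplified model, not the real TurnApi, showing how such an aliased optional field behaves under Pydantic v2; populate_by_name=True is an assumption made so the field accepts either name:

from typing import Optional

from pydantic import BaseModel, ConfigDict, Field


class Turn(BaseModel):
    model_config = ConfigDict(populate_by_name=True)

    role: str
    content: str
    order: int
    user_id: Optional[str] = Field(None, alias="userId")


turn = Turn(role="user", content="hello", order=0, user_id="u-123")
print(turn.model_dump(by_alias=True))
# {'role': 'user', 'content': 'hello', 'order': 0, 'userId': 'u-123'}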