deepeval-3.6.4-py3-none-any.whl → deepeval-3.6.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. deepeval/__init__.py +42 -10
  2. deepeval/_version.py +1 -1
  3. deepeval/config/logging.py +33 -0
  4. deepeval/config/settings.py +167 -12
  5. deepeval/dataset/dataset.py +8 -2
  6. deepeval/evaluate/evaluate.py +8 -2
  7. deepeval/evaluate/execute.py +28 -30
  8. deepeval/evaluate/types.py +4 -1
  9. deepeval/evaluate/utils.py +46 -29
  10. deepeval/integrations/crewai/__init__.py +1 -2
  11. deepeval/integrations/crewai/handler.py +153 -81
  12. deepeval/integrations/crewai/wrapper.py +87 -0
  13. deepeval/integrations/pydantic_ai/instrumentator.py +48 -9
  14. deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
  15. deepeval/metrics/faithfulness/faithfulness.py +8 -0
  16. deepeval/metrics/g_eval/g_eval.py +26 -15
  17. deepeval/metrics/prompt_alignment/prompt_alignment.py +41 -23
  18. deepeval/models/retry_policy.py +202 -11
  19. deepeval/test_run/__init__.py +2 -1
  20. deepeval/test_run/api.py +1 -0
  21. deepeval/test_run/test_run.py +85 -9
  22. deepeval/tracing/__init__.py +2 -0
  23. deepeval/tracing/otel/exporter.py +0 -6
  24. deepeval/tracing/otel/test_exporter.py +35 -0
  25. deepeval/tracing/otel/utils.py +57 -7
  26. deepeval/tracing/trace_context.py +14 -0
  27. deepeval/tracing/trace_test_manager.py +19 -0
  28. deepeval/tracing/tracing.py +7 -6
  29. deepeval/tracing/utils.py +2 -86
  30. deepeval/utils.py +149 -1
  31. {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/METADATA +1 -1
  32. {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/RECORD +35 -31
  33. deepeval/integrations/crewai/agent.py +0 -98
  34. deepeval/integrations/crewai/patch.py +0 -41
  35. {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/LICENSE.md +0 -0
  36. {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/WHEEL +0 -0
  37. {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/entry_points.txt +0 -0
deepeval/__init__.py CHANGED
@@ -1,24 +1,56 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
1
4
  import os
2
- import warnings
3
5
  import re
6
+ import warnings
4
7
 
5
- # load environment variables before other imports
8
+ # IMPORTANT: load environment variables before other imports
6
9
  from deepeval.config.settings import autoload_dotenv, get_settings
7
10
 
11
+ logging.getLogger("deepeval").addHandler(logging.NullHandler())
8
12
  autoload_dotenv()
9
13
 
10
- from ._version import __version__
11
- from deepeval.evaluate import evaluate, assert_test
12
- from deepeval.evaluate.compare import compare
13
- from deepeval.test_run import on_test_run_end, log_hyperparameters
14
- from deepeval.utils import login
15
- from deepeval.telemetry import *
14
+
15
+ def _expose_public_api() -> None:
16
+ # All other imports must happen after env is loaded
17
+ # Do not do this at module level or ruff will complain with E402
18
+ global __version__, evaluate, assert_test, compare
19
+ global on_test_run_end, log_hyperparameters, login, telemetry
20
+
21
+ from ._version import __version__ as _version
22
+ from deepeval.evaluate import (
23
+ evaluate as _evaluate,
24
+ assert_test as _assert_test,
25
+ )
26
+ from deepeval.evaluate.compare import compare as _compare
27
+ from deepeval.test_run import (
28
+ on_test_run_end as _on_end,
29
+ log_hyperparameters as _log_hparams,
30
+ )
31
+ from deepeval.utils import login as _login
32
+ import deepeval.telemetry as _telemetry
33
+
34
+ __version__ = _version
35
+ evaluate = _evaluate
36
+ assert_test = _assert_test
37
+ compare = _compare
38
+ on_test_run_end = _on_end
39
+ log_hyperparameters = _log_hparams
40
+ login = _login
41
+ telemetry = _telemetry
42
+
43
+
44
+ _expose_public_api()
16
45
 
17
46
 
18
47
  settings = get_settings()
48
+
19
49
  if not settings.DEEPEVAL_GRPC_LOGGING:
20
- os.environ.setdefault("GRPC_VERBOSITY", "ERROR")
21
- os.environ.setdefault("GRPC_TRACE", "")
50
+ if os.getenv("GRPC_VERBOSITY") is None:
51
+ os.environ["GRPC_VERBOSITY"] = settings.GRPC_VERBOSITY or "ERROR"
52
+ if os.getenv("GRPC_TRACE") is None:
53
+ os.environ["GRPC_TRACE"] = settings.GRPC_TRACE or ""
22
54
 
23
55
 
24
56
  __all__ = [
deepeval/_version.py CHANGED
@@ -1 +1 @@
1
- __version__: str = "3.6.4"
1
+ __version__: str = "3.6.6"
deepeval/config/logging.py ADDED
@@ -0,0 +1,33 @@
1
+ """
2
+ Minimal logging configuration helpers for DeepEval.
3
+
4
+ This module centralizes how the library-level logger ("deepeval") is configured. We
5
+ intentionally keep configuration lightweight so application code retains control
6
+ over handlers and formatters.
7
+ """
8
+
9
+ import logging
10
+ from deepeval.config.settings import get_settings
11
+
12
+
13
+ def apply_deepeval_log_level() -> None:
14
+ """
15
+ Apply DeepEval's current log level to the package logger.
16
+
17
+ This function reads `LOG_LEVEL` from `deepeval.config.settings.get_settings()`
18
+ and sets the level of the `"deepeval"` logger accordingly. If `LOG_LEVEL` is
19
+ unset (None), INFO is used as a default. The logger's `propagate` flag is set
20
+ to True so records bubble up to the application's handlers. DeepEval does not
21
+ install its own handlers here (a NullHandler is attached in `__init__.py`).
22
+
23
+ The function is idempotent and safe to call multiple times. It is invoked
24
+ automatically when settings are first constructed and whenever `LOG_LEVEL`
25
+ is changed via `settings.edit`.
26
+ """
27
+ settings = get_settings()
28
+ log_level = settings.LOG_LEVEL
29
+ logging.getLogger("deepeval").setLevel(
30
+ log_level if log_level is not None else logging.INFO
31
+ )
32
+ # ensure we bubble up to app handlers
33
+ logging.getLogger("deepeval").propagate = True
deepeval/config/settings.py CHANGED
@@ -10,12 +10,20 @@ Central config for DeepEval.
10
10
  """
11
11
 
12
12
  import logging
13
+ import math
13
14
  import os
14
15
  import re
15
16
 
16
17
  from dotenv import dotenv_values
17
18
  from pathlib import Path
18
- from pydantic import AnyUrl, SecretStr, field_validator, confloat
19
+ from pydantic import (
20
+ AnyUrl,
21
+ computed_field,
22
+ confloat,
23
+ conint,
24
+ field_validator,
25
+ SecretStr,
26
+ )
19
27
  from pydantic_settings import BaseSettings, SettingsConfigDict
20
28
  from typing import Any, Dict, List, Optional, NamedTuple
21
29
 
@@ -155,7 +163,7 @@ class Settings(BaseSettings):
155
163
  #
156
164
 
157
165
  APP_ENV: str = "dev"
158
- LOG_LEVEL: str = "info"
166
+ LOG_LEVEL: Optional[int] = None
159
167
  PYTHONPATH: str = "."
160
168
  CONFIDENT_REGION: Optional[str] = None
161
169
  CONFIDENT_OPEN_BROWSER: Optional[bool] = True
@@ -180,6 +188,19 @@ class Settings(BaseSettings):
180
188
  # into this directory. The directory will be created on demand.
181
189
  DEEPEVAL_RESULTS_FOLDER: Optional[Path] = None
182
190
 
191
+ # Display / Truncation
192
+ DEEPEVAL_MAXLEN_TINY: Optional[int] = 40
193
+ DEEPEVAL_MAXLEN_SHORT: Optional[int] = 60
194
+ DEEPEVAL_MAXLEN_MEDIUM: Optional[int] = 120
195
+ DEEPEVAL_MAXLEN_LONG: Optional[int] = 240
196
+
197
+ # If set, this overrides the default max_len used by deepeval/utils shorten
198
+ # falls back to DEEPEVAL_MAXLEN_LONG when None.
199
+ DEEPEVAL_SHORTEN_DEFAULT_MAXLEN: Optional[int] = None
200
+
201
+ # Optional global suffix (keeps your "..." default).
202
+ DEEPEVAL_SHORTEN_SUFFIX: Optional[str] = "..."
203
+
183
204
  #
184
205
  # GPU and perf toggles
185
206
  #
@@ -274,9 +295,33 @@ class Settings(BaseSettings):
274
295
  #
275
296
  # Retry Policy
276
297
  #
277
- DEEPEVAL_SDK_RETRY_PROVIDERS: Optional[List[str]] = None
278
- DEEPEVAL_RETRY_BEFORE_LOG_LEVEL: Optional[int] = None # default -> INFO
298
+ # Controls how Tenacity retries provider calls when the SDK isn't doing its own retries.
299
+ # Key concepts:
300
+ # - attempts count includes the first call. e.g. 1 = no retries, 2 = one retry.
301
+ # - backoff sleeps follow exponential growth with a cap, plus jitter. Expected jitter
302
+ # contribution is ~ JITTER/2 per sleep.
303
+ # - logging levels are looked up dynamically each attempt, so if you change LOG_LEVEL at runtime,
304
+ # the retry loggers will honor it without restart.
305
+ DEEPEVAL_SDK_RETRY_PROVIDERS: Optional[List[str]] = (
306
+ None # ["*"] to delegate all retries to SDKs
307
+ )
308
+ DEEPEVAL_RETRY_BEFORE_LOG_LEVEL: Optional[int] = (
309
+ None # default is LOG_LEVEL if set, else INFO
310
+ )
279
311
  DEEPEVAL_RETRY_AFTER_LOG_LEVEL: Optional[int] = None # default -> ERROR
312
+ DEEPEVAL_RETRY_MAX_ATTEMPTS: conint(ge=1) = (
313
+ 2 # attempts = first try + retries
314
+ )
315
+ DEEPEVAL_RETRY_INITIAL_SECONDS: confloat(ge=0) = (
316
+ 1.0 # first sleep before retry, if any
317
+ )
318
+ DEEPEVAL_RETRY_EXP_BASE: confloat(ge=1) = (
319
+ 2.0 # exponential growth factor for sleeps
320
+ )
321
+ DEEPEVAL_RETRY_JITTER: confloat(ge=0) = 2.0 # uniform jitter
322
+ DEEPEVAL_RETRY_CAP_SECONDS: confloat(ge=0) = (
323
+ 5.0 # cap for each backoff sleep
324
+ )
280
325
 
281
326
  #
282
327
  # Telemetry and Debug
@@ -303,19 +348,87 @@ class Settings(BaseSettings):
303
348
  #
304
349
  MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS: float = 3.05
305
350
  MEDIA_IMAGE_READ_TIMEOUT_SECONDS: float = 10.0
351
+ # DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS: per-attempt timeout for provider calls enforced by our retry decorator.
352
+ # This timeout interacts with retry policy and the task level budget (DEEPEVAL_PER_TASK_TIMEOUT_SECONDS) below.
353
+ # If you leave this at 0/None, the computed outer budget defaults to 180s.
354
+ DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS: Optional[confloat(ge=0)] = (
355
+ None # per-attempt timeout. Set 0/None to disable
356
+ )
306
357
 
307
358
  #
308
359
  # Async Task Configuration
309
360
  #
310
-
311
- # Maximum time allowed for a single task to complete
312
- DEEPEVAL_PER_TASK_TIMEOUT_SECONDS: int = (
313
- 300 # Set to float('inf') to disable timeout
314
- )
361
+ DEEPEVAL_TIMEOUT_THREAD_LIMIT: conint(ge=1) = 128
362
+ DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS: confloat(ge=0) = 5.0
363
+ # DEEPEVAL_PER_TASK_TIMEOUT_SECONDS is the outer time budget for one metric/task.
364
+ # It is computed from per-attempt timeout + retries/backoff unless you explicitly override it.
365
+ # - OVERRIDE = None or 0 -> auto compute as:
366
+ # attempts * per_attempt_timeout + sum(backoff_sleeps) + ~jitter/2 per sleep + 1s safety
367
+ # (If per_attempt_timeout is 0/None, the auto outer budget defaults to 180s.)
368
+ # - OVERRIDE > 0 -> use that exact value. A warning is logged if it is likely too small
369
+ # to permit the configured attempts/backoff.
370
+ #
371
+ # Tip:
372
+ # Most users only need to set DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS and DEEPEVAL_RETRY_MAX_ATTEMPTS.
373
+ # Leave the outer budget on auto unless you have very strict SLAs.
374
+ DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE: Optional[conint(ge=0)] = None
315
375
 
316
376
  # Buffer time for gathering results from all tasks, added to the longest task duration
317
377
  # Increase if many tasks are running concurrently
318
- DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: int = 60
378
+ DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: confloat(ge=0) = 60
379
+
380
+ ###################
381
+ # Computed Fields #
382
+ ###################
383
+
384
+ def _calc_auto_outer_timeout(self) -> int:
385
+ """Compute outer budget from per-attempt timeout + retries/backoff.
386
+ Never reference the computed property itself here.
387
+ """
388
+ attempts = self.DEEPEVAL_RETRY_MAX_ATTEMPTS or 1
389
+ timeout_seconds = float(self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
390
+ if timeout_seconds <= 0:
391
+ # No per-attempt timeout set -> default outer budget
392
+ return 180
393
+
394
+ sleeps = max(0, attempts - 1)
395
+ cur = float(self.DEEPEVAL_RETRY_INITIAL_SECONDS)
396
+ cap = float(self.DEEPEVAL_RETRY_CAP_SECONDS)
397
+ base = float(self.DEEPEVAL_RETRY_EXP_BASE)
398
+ jitter = float(self.DEEPEVAL_RETRY_JITTER)
399
+
400
+ backoff = 0.0
401
+ for _ in range(sleeps):
402
+ backoff += min(cap, cur)
403
+ cur *= base
404
+ backoff += sleeps * (jitter / 2.0) # expected jitter
405
+
406
+ safety_overhead = 1.0
407
+ return int(
408
+ math.ceil(attempts * timeout_seconds + backoff + safety_overhead)
409
+ )
410
+
411
+ @computed_field
412
+ @property
413
+ def DEEPEVAL_PER_TASK_TIMEOUT_SECONDS(self) -> int:
414
+ """If OVERRIDE is set (nonzero), return it; else return the derived budget."""
415
+ outer = self.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE
416
+ if outer not in (None, 0):
417
+ # Warn if user-provided outer is likely to truncate retries
418
+ if (self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0) > 0:
419
+ min_needed = self._calc_auto_outer_timeout()
420
+ if int(outer) < min_needed:
421
+ if self.DEEPEVAL_VERBOSE_MODE:
422
+ logger.warning(
423
+ "Metric timeout (outer=%ss) is less than attempts × per-attempt "
424
+ "timeout + backoff (≈%ss). Retries may be cut short.",
425
+ int(outer),
426
+ min_needed,
427
+ )
428
+ return int(outer)
429
+
430
+ # Auto mode
431
+ return self._calc_auto_outer_timeout()
319
432
 
320
433
  ##############
321
434
  # Validators #
@@ -461,7 +574,9 @@ class Settings(BaseSettings):
461
574
  if s in SUPPORTED_PROVIDER_SLUGS:
462
575
  normalized.append(s)
463
576
  else:
464
- if cls.DEEPEVAL_VERBOSE_MODE:
577
+ if parse_bool(
578
+ os.getenv("DEEPEVAL_VERBOSE_MODE"), default=False
579
+ ):
465
580
  logger.warning("Unknown provider slug %r dropped", item)
466
581
 
467
582
  if star:
@@ -474,6 +589,7 @@ class Settings(BaseSettings):
474
589
  @field_validator(
475
590
  "DEEPEVAL_RETRY_BEFORE_LOG_LEVEL",
476
591
  "DEEPEVAL_RETRY_AFTER_LOG_LEVEL",
592
+ "LOG_LEVEL",
477
593
  mode="before",
478
594
  )
479
595
  @classmethod
@@ -511,6 +627,10 @@ class Settings(BaseSettings):
511
627
  # Persistence support #
512
628
  #######################
513
629
  class _SettingsEditCtx:
630
+ COMPUTED_FIELDS: frozenset[str] = frozenset(
631
+ {"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS"}
632
+ )
633
+
514
634
  def __init__(
515
635
  self,
516
636
  settings: "Settings",
@@ -546,8 +666,11 @@ class Settings(BaseSettings):
546
666
  # lazy import legacy JSON store deps
547
667
  from deepeval.key_handler import KEY_FILE_HANDLER
548
668
 
669
+ model_fields = type(self._s).model_fields
670
+ # Exclude computed fields from persistence
671
+
549
672
  # compute diff of changed fields
550
- after = {k: getattr(self._s, k) for k in type(self._s).model_fields}
673
+ after = {k: getattr(self._s, k) for k in model_fields}
551
674
 
552
675
  before_norm = {
553
676
  k: _normalize_for_env(v) for k, v in self._before.items()
@@ -557,12 +680,21 @@ class Settings(BaseSettings):
557
680
  changed_keys = {
558
681
  k for k in after_norm if after_norm[k] != before_norm.get(k)
559
682
  }
683
+ changed_keys -= self.COMPUTED_FIELDS
684
+
560
685
  if not changed_keys:
561
686
  self.result = PersistResult(False, None, {})
562
687
  return False
563
688
 
564
689
  updates = {k: after[k] for k in changed_keys}
565
690
 
691
+ if "LOG_LEVEL" in updates:
692
+ from deepeval.config.logging import (
693
+ apply_deepeval_log_level,
694
+ )
695
+
696
+ apply_deepeval_log_level()
697
+
566
698
  #
567
699
  # .deepeval JSON support
568
700
  #
@@ -668,4 +800,27 @@ def get_settings() -> Settings:
668
800
  global _settings_singleton
669
801
  if _settings_singleton is None:
670
802
  _settings_singleton = Settings()
803
+ from deepeval.config.logging import apply_deepeval_log_level
804
+
805
+ apply_deepeval_log_level()
671
806
  return _settings_singleton
807
+
808
+
809
+ def reset_settings(*, reload_dotenv: bool = False) -> Settings:
810
+ """
811
+ Drop the cached Settings singleton and rebuild it from the current process
812
+ environment.
813
+
814
+ Args:
815
+ reload_dotenv: When True, call `autoload_dotenv()` before re-instantiating,
816
+ which merges .env values into os.environ (never overwriting
817
+ existing process env vars).
818
+
819
+ Returns:
820
+ The fresh Settings instance.
821
+ """
822
+ global _settings_singleton
823
+ if reload_dotenv:
824
+ autoload_dotenv()
825
+ _settings_singleton = None
826
+ return get_settings()
deepeval/dataset/dataset.py CHANGED
@@ -1266,11 +1266,17 @@ class EvaluationDataset:
1266
1266
  detach(ctx_token)
1267
1267
 
1268
1268
  else:
1269
- confident_link = global_test_run_manager.wrap_up_test_run(
1269
+ res = global_test_run_manager.wrap_up_test_run(
1270
1270
  run_duration, display_table=False
1271
1271
  )
1272
+ if isinstance(res, tuple):
1273
+ confident_link, test_run_id = res
1274
+ else:
1275
+ confident_link = test_run_id = None
1272
1276
  return EvaluationResult(
1273
- test_results=test_results, confident_link=confident_link
1277
+ test_results=test_results,
1278
+ confident_link=confident_link,
1279
+ test_run_id=test_run_id,
1274
1280
  )
1275
1281
 
1276
1282
  def evaluate(self, task: Task):
deepeval/evaluate/evaluate.py CHANGED
@@ -268,11 +268,17 @@ def evaluate(
268
268
  test_run = global_test_run_manager.get_test_run()
269
269
  test_run.hyperparameters = process_hyperparameters(hyperparameters)
270
270
  global_test_run_manager.save_test_run(TEMP_FILE_PATH)
271
- confident_link = global_test_run_manager.wrap_up_test_run(
271
+ res = global_test_run_manager.wrap_up_test_run(
272
272
  run_duration, display_table=False
273
273
  )
274
+ if isinstance(res, tuple):
275
+ confident_link, test_run_id = res
276
+ else:
277
+ confident_link = test_run_id = None
274
278
  return EvaluationResult(
275
- test_results=test_results, confident_link=confident_link
279
+ test_results=test_results,
280
+ confident_link=confident_link,
281
+ test_run_id=test_run_id,
276
282
  )
277
283
  elif metric_collection:
278
284
  api = Api()
deepeval/evaluate/execute.py CHANGED
@@ -45,9 +45,7 @@ from deepeval.dataset import Golden
45
45
  from deepeval.contextvars import set_current_golden, reset_current_golden
46
46
  from deepeval.errors import MissingTestCaseParamsError
47
47
  from deepeval.metrics.utils import copy_metrics
48
- from deepeval.utils import (
49
- get_or_create_event_loop,
50
- )
48
+ from deepeval.utils import get_or_create_event_loop, shorten, len_medium
51
49
  from deepeval.telemetry import capture_evaluation_run
52
50
  from deepeval.metrics import (
53
51
  BaseMetric,
@@ -93,7 +91,6 @@ from deepeval.config.settings import get_settings
93
91
 
94
92
 
95
93
  logger = logging.getLogger(__name__)
96
- settings = get_settings()
97
94
 
98
95
 
99
96
  async def _snapshot_tasks():
@@ -102,6 +99,18 @@ async def _snapshot_tasks():
102
99
  return {t for t in asyncio.all_tasks() if t is not cur}
103
100
 
104
101
 
102
+ def _per_task_timeout() -> float:
103
+ return get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
104
+
105
+
106
+ def _gather_timeout() -> float:
107
+ s = get_settings()
108
+ return (
109
+ s.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
110
+ + s.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
111
+ )
112
+
113
+
105
114
  ###########################################
106
115
  ### E2E Evals #############################
107
116
  ###########################################
@@ -840,7 +849,7 @@ def execute_agentic_test_cases(
840
849
  loop.run_until_complete(
841
850
  asyncio.wait_for(
842
851
  coro,
843
- timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
852
+ timeout=_per_task_timeout(),
844
853
  )
845
854
  )
846
855
  else:
@@ -1198,7 +1207,7 @@ async def _a_execute_agentic_test_case(
1198
1207
  if asyncio.iscoroutinefunction(observed_callback):
1199
1208
  await asyncio.wait_for(
1200
1209
  observed_callback(golden.input),
1201
- timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
1210
+ timeout=_per_task_timeout(),
1202
1211
  )
1203
1212
  else:
1204
1213
  observed_callback(golden.input)
@@ -1755,11 +1764,6 @@ def a_execute_agentic_test_cases_from_loop(
1755
1764
  _is_assert_test: bool = False,
1756
1765
  ) -> Iterator[TestResult]:
1757
1766
 
1758
- GATHER_TIMEOUT_SECONDS = (
1759
- settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
1760
- + settings.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
1761
- )
1762
-
1763
1767
  semaphore = asyncio.Semaphore(async_config.max_concurrent)
1764
1768
  original_create_task = asyncio.create_task
1765
1769
 
@@ -1774,7 +1778,7 @@ def a_execute_agentic_test_cases_from_loop(
1774
1778
  async def execute_callback_with_semaphore(coroutine: Awaitable):
1775
1779
  async with semaphore:
1776
1780
  return await asyncio.wait_for(
1777
- coroutine, timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
1781
+ coroutine, timeout=_per_task_timeout()
1778
1782
  )
1779
1783
 
1780
1784
  def evaluate_test_cases(
@@ -1802,14 +1806,11 @@ def a_execute_agentic_test_cases_from_loop(
1802
1806
  )
1803
1807
 
1804
1808
  # record metadata for debugging
1805
- MAX_META_INPUT_LENGTH = 120
1806
1809
  started = time.perf_counter()
1807
- short_input = current_golden_ctx["input"]
1808
- if (
1809
- isinstance(short_input, str)
1810
- and len(short_input) > MAX_META_INPUT_LENGTH
1811
- ):
1812
- short_input = short_input[:MAX_META_INPUT_LENGTH] + "…"
1810
+ short_input = current_golden_ctx.get("input")
1811
+ if isinstance(short_input, str):
1812
+ short_input = shorten(short_input, len_medium())
1813
+
1813
1814
  task_meta[task] = {
1814
1815
  "golden_index": current_golden_ctx["index"],
1815
1816
  "golden_name": current_golden_ctx["name"],
@@ -1819,7 +1820,7 @@ def a_execute_agentic_test_cases_from_loop(
1819
1820
  }
1820
1821
 
1821
1822
  def on_task_done(t: asyncio.Task):
1822
- if settings.DEEPEVAL_DEBUG_ASYNC:
1823
+ if get_settings().DEEPEVAL_DEBUG_ASYNC:
1823
1824
  # Using info level here to make it easy to spot these logs.
1824
1825
  # We are gated by DEEPEVAL_DEBUG_ASYNC
1825
1826
  meta = task_meta.get(t, {})
@@ -1893,7 +1894,7 @@ def a_execute_agentic_test_cases_from_loop(
1893
1894
  loop.run_until_complete(
1894
1895
  asyncio.wait_for(
1895
1896
  asyncio.gather(*created_tasks, return_exceptions=True),
1896
- timeout=GATHER_TIMEOUT_SECONDS,
1897
+ timeout=_gather_timeout(),
1897
1898
  )
1898
1899
  )
1899
1900
  except asyncio.TimeoutError:
@@ -1908,16 +1909,13 @@ def a_execute_agentic_test_cases_from_loop(
1908
1909
  elapsed_time = time.perf_counter() - start_time
1909
1910
 
1910
1911
  # Determine if it was a per task or gather timeout based on task's elapsed time
1911
- if (
1912
- elapsed_time
1913
- >= settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
1914
- ):
1912
+ if elapsed_time >= _per_task_timeout():
1915
1913
  timeout_type = "per-task"
1916
1914
  else:
1917
1915
  timeout_type = "gather"
1918
1916
 
1919
1917
  logger.warning(
1920
- f"[deepeval] gather TIMEOUT after {GATHER_TIMEOUT_SECONDS}s; "
1918
+ f"[deepeval] gather TIMEOUT after {_gather_timeout()}s; "
1921
1919
  f"pending={len(pending)} tasks. Timeout type: {timeout_type}. "
1922
1920
  f"To give tasks more time, consider increasing "
1923
1921
  f"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS for longer task completion time or "
@@ -1931,7 +1929,7 @@ def a_execute_agentic_test_cases_from_loop(
1931
1929
  elapsed_time,
1932
1930
  meta,
1933
1931
  )
1934
- if loop.get_debug() and settings.DEEPEVAL_DEBUG_ASYNC:
1932
+ if loop.get_debug() and get_settings().DEEPEVAL_DEBUG_ASYNC:
1935
1933
  frames = t.get_stack(limit=6)
1936
1934
  if frames:
1937
1935
  logger.info(" stack:")
@@ -1970,9 +1968,9 @@ def a_execute_agentic_test_cases_from_loop(
1970
1968
  if not leftovers:
1971
1969
  return
1972
1970
 
1973
- if settings.DEEPEVAL_DEBUG_ASYNC:
1971
+ if get_settings().DEEPEVAL_DEBUG_ASYNC:
1974
1972
  logger.warning(
1975
- "[deepeval] %d stray task(s) not tracked; cancelling",
1973
+ "[deepeval] %d stray task(s) not tracked; cancelling...",
1976
1974
  len(leftovers),
1977
1975
  )
1978
1976
  for t in leftovers:
@@ -1990,7 +1988,7 @@ def a_execute_agentic_test_cases_from_loop(
1990
1988
  )
1991
1989
  except RuntimeError:
1992
1990
  # If the loop is closing here, just continue
1993
- if settings.DEEPEVAL_DEBUG_ASYNC:
1991
+ if get_settings().DEEPEVAL_DEBUG_ASYNC:
1994
1992
  logger.warning(
1995
1993
  "[deepeval] failed to drain stray tasks because loop is closing"
1996
1994
  )
deepeval/evaluate/types.py CHANGED
@@ -1,7 +1,8 @@
1
1
  from typing import Optional, List, Union, Dict
2
2
  from dataclasses import dataclass
3
3
  from pydantic import BaseModel
4
- from deepeval.test_run import MetricData
4
+
5
+ from deepeval.test_run.api import MetricData, TurnApi
5
6
  from deepeval.test_case import MLLMImage
6
7
 
7
8
 
@@ -19,9 +20,11 @@ class TestResult:
19
20
  expected_output: Optional[str] = None
20
21
  context: Optional[List[str]] = None
21
22
  retrieval_context: Optional[List[str]] = None
23
+ turns: Optional[List[TurnApi]] = None
22
24
  additional_metadata: Optional[Dict] = None
23
25
 
24
26
 
25
27
  class EvaluationResult(BaseModel):
26
28
  test_results: List[TestResult]
27
29
  confident_link: Optional[str]
30
+ test_run_id: Optional[str]