deepeval 3.6.5__py3-none-any.whl → 3.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deepeval/__init__.py CHANGED
@@ -1,24 +1,56 @@
+ from __future__ import annotations
+
+ import logging
  import os
- import warnings
  import re
+ import warnings

- # load environment variables before other imports
+ # IMPORTANT: load environment variables before other imports
  from deepeval.config.settings import autoload_dotenv, get_settings

+ logging.getLogger("deepeval").addHandler(logging.NullHandler())
  autoload_dotenv()

- from ._version import __version__
- from deepeval.evaluate import evaluate, assert_test
- from deepeval.evaluate.compare import compare
- from deepeval.test_run import on_test_run_end, log_hyperparameters
- from deepeval.utils import login
- from deepeval.telemetry import *
+
+ def _expose_public_api() -> None:
+     # All other imports must happen after env is loaded
+     # Do not do this at module level or ruff will complain with E402
+     global __version__, evaluate, assert_test, compare
+     global on_test_run_end, log_hyperparameters, login, telemetry
+
+     from ._version import __version__ as _version
+     from deepeval.evaluate import (
+         evaluate as _evaluate,
+         assert_test as _assert_test,
+     )
+     from deepeval.evaluate.compare import compare as _compare
+     from deepeval.test_run import (
+         on_test_run_end as _on_end,
+         log_hyperparameters as _log_hparams,
+     )
+     from deepeval.utils import login as _login
+     import deepeval.telemetry as _telemetry
+
+     __version__ = _version
+     evaluate = _evaluate
+     assert_test = _assert_test
+     compare = _compare
+     on_test_run_end = _on_end
+     log_hyperparameters = _log_hparams
+     login = _login
+     telemetry = _telemetry
+
+
+ _expose_public_api()


  settings = get_settings()
+
  if not settings.DEEPEVAL_GRPC_LOGGING:
-     os.environ.setdefault("GRPC_VERBOSITY", "ERROR")
-     os.environ.setdefault("GRPC_TRACE", "")
+     if os.getenv("GRPC_VERBOSITY") is None:
+         os.environ["GRPC_VERBOSITY"] = settings.GRPC_VERBOSITY or "ERROR"
+     if os.getenv("GRPC_TRACE") is None:
+         os.environ["GRPC_TRACE"] = settings.GRPC_TRACE or ""


  __all__ = [
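
Note: the `_expose_public_api()` indirection exists so every public import runs after `autoload_dotenv()` without tripping ruff's E402 at module level. A minimal, self-contained sketch of the same pattern (using `json` as a stand-in for the deferred modules):

    import os


    def _expose_public_api() -> None:
        # Bind the public name lazily, after any env setup has run.
        global dumps
        from json import dumps as _dumps  # stand-in for the real deferred imports
        dumps = _dumps


    os.environ.setdefault("EXAMPLE_READY", "1")  # stands in for autoload_dotenv()
    _expose_public_api()
    print(dumps({"ok": True}))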
deepeval/_version.py CHANGED
@@ -1 +1 @@
- __version__: str = "3.6.5"
+ __version__: str = "3.6.6"
deepeval/config/logging.py ADDED
@@ -0,0 +1,33 @@
+ """
+ Minimal logging configuration helpers for DeepEval.
+
+ This module centralizes how the library-level logger ("deepeval") is configured. We
+ intentionally keep configuration lightweight so application code retains control
+ over handlers and formatters.
+ """
+
+ import logging
+ from deepeval.config.settings import get_settings
+
+
+ def apply_deepeval_log_level() -> None:
+     """
+     Apply DeepEval's current log level to the package logger.
+
+     This function reads `LOG_LEVEL` from `deepeval.config.settings.get_settings()`
+     and sets the level of the `"deepeval"` logger accordingly. If `LOG_LEVEL` is
+     unset (None), INFO is used as a default. The logger's `propagate` flag is set
+     to True so records bubble up to the application's handlers. DeepEval does not
+     install its own handlers here (a NullHandler is attached in `__init__.py`).
+
+     The function is idempotent and safe to call multiple times. It is invoked
+     automatically when settings are first constructed and whenever `LOG_LEVEL`
+     is changed via `settings.edit`.
+     """
+     settings = get_settings()
+     log_level = settings.LOG_LEVEL
+     logging.getLogger("deepeval").setLevel(
+         log_level if log_level is not None else logging.INFO
+     )
+     # ensure we bubble up to app handlers
+     logging.getLogger("deepeval").propagate = True
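
Note: since DeepEval only attaches a NullHandler and leaves `propagate = True`, an application opts into DeepEval logs with its own handler. A sketch, assuming `LOG_LEVEL` is picked up from the environment like any other pydantic-settings field (a numeric level is the safest form):

    import logging
    import os

    os.environ.setdefault("LOG_LEVEL", "10")  # 10 == DEBUG; set before importing

    logging.basicConfig(level=logging.DEBUG)  # the app owns handlers/formatters

    import deepeval  # noqa: E402  # "deepeval" logger records propagate to root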
deepeval/config/settings.py CHANGED
@@ -10,12 +10,20 @@ Central config for DeepEval.
  """

  import logging
+ import math
  import os
  import re

  from dotenv import dotenv_values
  from pathlib import Path
- from pydantic import AnyUrl, SecretStr, field_validator, confloat
+ from pydantic import (
+     AnyUrl,
+     computed_field,
+     confloat,
+     conint,
+     field_validator,
+     SecretStr,
+ )
  from pydantic_settings import BaseSettings, SettingsConfigDict
  from typing import Any, Dict, List, Optional, NamedTuple

@@ -155,7 +163,7 @@ class Settings(BaseSettings):
      #

      APP_ENV: str = "dev"
-     LOG_LEVEL: str = "info"
+     LOG_LEVEL: Optional[int] = None
      PYTHONPATH: str = "."
      CONFIDENT_REGION: Optional[str] = None
      CONFIDENT_OPEN_BROWSER: Optional[bool] = True
@@ -287,9 +295,33 @@ class Settings(BaseSettings):
      #
      # Retry Policy
      #
-     DEEPEVAL_SDK_RETRY_PROVIDERS: Optional[List[str]] = None
-     DEEPEVAL_RETRY_BEFORE_LOG_LEVEL: Optional[int] = None  # default -> INFO
+     # Controls how Tenacity retries provider calls when the SDK isn't doing its own retries.
+     # Key concepts:
+     #   - attempts count includes the first call, e.g. 1 = no retries, 2 = one retry.
+     #   - backoff sleeps follow exponential growth with a cap, plus jitter. Expected
+     #     jitter contribution is ~ JITTER/2 per sleep.
+     #   - logging levels are looked up dynamically each attempt, so if you change
+     #     LOG_LEVEL at runtime, the retry loggers will honor it without a restart.
+     DEEPEVAL_SDK_RETRY_PROVIDERS: Optional[List[str]] = (
+         None  # ["*"] to delegate all retries to SDKs
+     )
+     DEEPEVAL_RETRY_BEFORE_LOG_LEVEL: Optional[int] = (
+         None  # default is LOG_LEVEL if set, else INFO
+     )
      DEEPEVAL_RETRY_AFTER_LOG_LEVEL: Optional[int] = None  # default -> ERROR
+     DEEPEVAL_RETRY_MAX_ATTEMPTS: conint(ge=1) = (
+         2  # attempts = first try + retries
+     )
+     DEEPEVAL_RETRY_INITIAL_SECONDS: confloat(ge=0) = (
+         1.0  # first sleep before retry, if any
+     )
+     DEEPEVAL_RETRY_EXP_BASE: confloat(ge=1) = (
+         2.0  # exponential growth factor for sleeps
+     )
+     DEEPEVAL_RETRY_JITTER: confloat(ge=0) = 2.0  # uniform jitter
+     DEEPEVAL_RETRY_CAP_SECONDS: confloat(ge=0) = (
+         5.0  # cap for each backoff sleep
+     )

      #
      # Telemetry and Debug
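
Note: with the defaults above (2 attempts, initial 1.0s, base 2.0, cap 5.0s, jitter 2.0) there is exactly one backoff sleep of min(5.0, 1.0) = 1.0s plus uniform jitter in [0, 2.0], i.e. about 2.0s expected. A sketch of the expected schedule:

    initial, exp_base, cap, jitter = 1.0, 2.0, 5.0, 2.0
    attempts = 2

    cur, expected_sleeps = initial, []
    for _ in range(attempts - 1):  # one sleep between two attempts
        expected_sleeps.append(min(cap, cur) + jitter / 2.0)  # expected jitter ~ JITTER/2
        cur *= exp_base

    print(expected_sleeps)  # [2.0]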
@@ -316,19 +348,87 @@ class Settings(BaseSettings):
      #
      MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS: float = 3.05
      MEDIA_IMAGE_READ_TIMEOUT_SECONDS: float = 10.0
+     # DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS: per-attempt timeout for provider calls,
+     # enforced by our retry decorator. It interacts with the retry policy and the
+     # task-level budget (DEEPEVAL_PER_TASK_TIMEOUT_SECONDS) below.
+     # If you leave this at 0/None, the computed outer budget defaults to 180s.
+     DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS: Optional[confloat(ge=0)] = (
+         None  # per-attempt timeout. Set 0/None to disable
+     )

      #
      # Async Task Configuration
      #
-
-     # Maximum time allowed for a single task to complete
-     DEEPEVAL_PER_TASK_TIMEOUT_SECONDS: int = (
-         300  # Set to float('inf') to disable timeout
-     )
+     DEEPEVAL_TIMEOUT_THREAD_LIMIT: conint(ge=1) = 128
+     DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS: confloat(ge=0) = 5.0
+     # DEEPEVAL_PER_TASK_TIMEOUT_SECONDS is the outer time budget for one metric/task.
+     # It is computed from the per-attempt timeout + retries/backoff unless you
+     # explicitly override it.
+     #   - OVERRIDE = None or 0 -> auto-compute as:
+     #       attempts * per_attempt_timeout + sum(backoff_sleeps) + ~jitter/2 per sleep + 1s safety
+     #     (If per_attempt_timeout is 0/None, the auto outer budget defaults to 180s.)
+     #   - OVERRIDE > 0 -> use that exact value. A warning is logged if it is likely
+     #     too small to permit the configured attempts/backoff.
+     #
+     # Tip:
+     #   Most users only need to set DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS and
+     #   DEEPEVAL_RETRY_MAX_ATTEMPTS. Leave the outer budget on auto unless you
+     #   have very strict SLAs.
+     DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE: Optional[conint(ge=0)] = None

      # Buffer time for gathering results from all tasks, added to the longest task duration
      # Increase if many tasks are running concurrently
-     DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: int = 60
+     DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: confloat(ge=0) = 60
+
+     ###################
+     # Computed Fields #
+     ###################
+
+     def _calc_auto_outer_timeout(self) -> int:
+         """Compute the outer budget from the per-attempt timeout + retries/backoff.
+
+         Never reference the computed property itself here.
+         """
+         attempts = self.DEEPEVAL_RETRY_MAX_ATTEMPTS or 1
+         timeout_seconds = float(self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
+         if timeout_seconds <= 0:
+             # No per-attempt timeout set -> default outer budget
+             return 180
+
+         sleeps = max(0, attempts - 1)
+         cur = float(self.DEEPEVAL_RETRY_INITIAL_SECONDS)
+         cap = float(self.DEEPEVAL_RETRY_CAP_SECONDS)
+         base = float(self.DEEPEVAL_RETRY_EXP_BASE)
+         jitter = float(self.DEEPEVAL_RETRY_JITTER)
+
+         backoff = 0.0
+         for _ in range(sleeps):
+             backoff += min(cap, cur)
+             cur *= base
+         backoff += sleeps * (jitter / 2.0)  # expected jitter
+
+         safety_overhead = 1.0
+         return int(
+             math.ceil(attempts * timeout_seconds + backoff + safety_overhead)
+         )
+
+     @computed_field
+     @property
+     def DEEPEVAL_PER_TASK_TIMEOUT_SECONDS(self) -> int:
+         """If OVERRIDE is set (nonzero), return it; else return the derived budget."""
+         outer = self.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE
+         if outer not in (None, 0):
+             # Warn if the user-provided outer budget is likely to truncate retries
+             if (self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0) > 0:
+                 min_needed = self._calc_auto_outer_timeout()
+                 if int(outer) < min_needed:
+                     if self.DEEPEVAL_VERBOSE_MODE:
+                         logger.warning(
+                             "Metric timeout (outer=%ss) is less than attempts × per-attempt "
+                             "timeout + backoff (≈%ss). Retries may be cut short.",
+                             int(outer),
+                             min_needed,
+                         )
+             return int(outer)
+
+         # Auto mode
+         return self._calc_auto_outer_timeout()

      ##############
      # Validators #
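
Note: the auto mode is easy to sanity-check by hand. With DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS=30 and the retry defaults, `_calc_auto_outer_timeout()` yields ceil(2 * 30 + 1.0 + 1.0 + 1.0) = 63. A standalone mirror of the computation:

    import math

    attempts, per_attempt = 2, 30.0                  # retry attempts, per-attempt timeout
    initial, base, cap, jitter = 1.0, 2.0, 5.0, 2.0  # retry defaults

    sleeps = attempts - 1
    cur, backoff = initial, 0.0
    for _ in range(sleeps):
        backoff += min(cap, cur)
        cur *= base
    backoff += sleeps * (jitter / 2.0)  # expected jitter contribution

    print(math.ceil(attempts * per_attempt + backoff + 1.0))  # 63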
@@ -474,7 +574,9 @@ class Settings(BaseSettings):
              if s in SUPPORTED_PROVIDER_SLUGS:
                  normalized.append(s)
              else:
-                 if cls.DEEPEVAL_VERBOSE_MODE:
+                 if parse_bool(
+                     os.getenv("DEEPEVAL_VERBOSE_MODE"), default=False
+                 ):
                      logger.warning("Unknown provider slug %r dropped", item)

          if star:
@@ -487,6 +589,7 @@
      @field_validator(
          "DEEPEVAL_RETRY_BEFORE_LOG_LEVEL",
          "DEEPEVAL_RETRY_AFTER_LOG_LEVEL",
+         "LOG_LEVEL",
          mode="before",
      )
      @classmethod
@@ -524,6 +627,10 @@
      # Persistence support #
      #######################
      class _SettingsEditCtx:
+         COMPUTED_FIELDS: frozenset[str] = frozenset(
+             {"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS"}
+         )
+
          def __init__(
              self,
              settings: "Settings",
@@ -559,8 +666,11 @@
              # lazy import legacy JSON store deps
              from deepeval.key_handler import KEY_FILE_HANDLER

+             model_fields = type(self._s).model_fields
+             # Exclude computed fields from persistence
+
              # compute diff of changed fields
-             after = {k: getattr(self._s, k) for k in type(self._s).model_fields}
+             after = {k: getattr(self._s, k) for k in model_fields}

              before_norm = {
                  k: _normalize_for_env(v) for k, v in self._before.items()
@@ -570,12 +680,21 @@
              changed_keys = {
                  k for k in after_norm if after_norm[k] != before_norm.get(k)
              }
+             changed_keys -= self.COMPUTED_FIELDS
+
              if not changed_keys:
                  self.result = PersistResult(False, None, {})
                  return False

              updates = {k: after[k] for k in changed_keys}

+             if "LOG_LEVEL" in updates:
+                 from deepeval.config.logging import (
+                     apply_deepeval_log_level,
+                 )
+
+                 apply_deepeval_log_level()
+
              #
              # .deepeval JSON support
              #
@@ -681,4 +800,27 @@ def get_settings() -> Settings:
      global _settings_singleton
      if _settings_singleton is None:
          _settings_singleton = Settings()
+         from deepeval.config.logging import apply_deepeval_log_level
+
+         apply_deepeval_log_level()
      return _settings_singleton
+
+
+ def reset_settings(*, reload_dotenv: bool = False) -> Settings:
+     """
+     Drop the cached Settings singleton and rebuild it from the current process
+     environment.
+
+     Args:
+         reload_dotenv: When True, call `autoload_dotenv()` before re-instantiating,
+             which merges .env values into os.environ (never overwriting existing
+             process env vars).
+
+     Returns:
+         The fresh Settings instance.
+     """
+     global _settings_singleton
+     if reload_dotenv:
+         autoload_dotenv()
+     _settings_singleton = None
+     return get_settings()
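
Note: `reset_settings()` is most useful in tests that mutate the process environment; a sketch (the env value is illustrative):

    import os

    from deepeval.config.settings import get_settings, reset_settings

    os.environ["DEEPEVAL_RETRY_MAX_ATTEMPTS"] = "3"  # illustrative override
    settings = reset_settings()  # rebuild the singleton from os.environ
    assert settings is get_settings()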
deepeval/evaluate/execute.py CHANGED
@@ -91,7 +91,6 @@ from deepeval.config.settings import get_settings


  logger = logging.getLogger(__name__)
- settings = get_settings()


  async def _snapshot_tasks():
@@ -100,6 +99,18 @@ async def _snapshot_tasks():
      return {t for t in asyncio.all_tasks() if t is not cur}


+ def _per_task_timeout() -> float:
+     return get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+
+
+ def _gather_timeout() -> float:
+     s = get_settings()
+     return (
+         s.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+         + s.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
+     )
+
+
  ###########################################
  ### E2E Evals #############################
  ###########################################
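
Note: replacing the import-time `settings` snapshot with `_per_task_timeout()` / `_gather_timeout()` means budgets are re-read from `get_settings()` on every use, so a later override is honored without re-importing; a sketch:

    import os

    from deepeval.config.settings import reset_settings

    os.environ["DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE"] = "600"
    reset_settings()  # subsequent _per_task_timeout() calls now see the 600s budget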
@@ -838,7 +849,7 @@ def execute_agentic_test_cases(
                  loop.run_until_complete(
                      asyncio.wait_for(
                          coro,
-                         timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+                         timeout=_per_task_timeout(),
                      )
                  )
              else:
@@ -1196,7 +1207,7 @@
              if asyncio.iscoroutinefunction(observed_callback):
                  await asyncio.wait_for(
                      observed_callback(golden.input),
-                     timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+                     timeout=_per_task_timeout(),
                  )
              else:
                  observed_callback(golden.input)
@@ -1753,11 +1764,6 @@ def a_execute_agentic_test_cases_from_loop(
      _is_assert_test: bool = False,
  ) -> Iterator[TestResult]:

-     GATHER_TIMEOUT_SECONDS = (
-         settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-         + settings.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
-     )
-
      semaphore = asyncio.Semaphore(async_config.max_concurrent)
      original_create_task = asyncio.create_task
@@ -1772,7 +1778,7 @@
      async def execute_callback_with_semaphore(coroutine: Awaitable):
          async with semaphore:
              return await asyncio.wait_for(
-                 coroutine, timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+                 coroutine, timeout=_per_task_timeout()
              )

      def evaluate_test_cases(
@@ -1814,7 +1820,7 @@
              }

          def on_task_done(t: asyncio.Task):
-             if settings.DEEPEVAL_DEBUG_ASYNC:
+             if get_settings().DEEPEVAL_DEBUG_ASYNC:
                  # Using info level here to make it easy to spot these logs.
                  # We are gated by DEEPEVAL_DEBUG_ASYNC
                  meta = task_meta.get(t, {})
@@ -1888,7 +1894,7 @@
          loop.run_until_complete(
              asyncio.wait_for(
                  asyncio.gather(*created_tasks, return_exceptions=True),
-                 timeout=GATHER_TIMEOUT_SECONDS,
+                 timeout=_gather_timeout(),
              )
          )
      except asyncio.TimeoutError:
@@ -1903,16 +1909,13 @@
              elapsed_time = time.perf_counter() - start_time

              # Determine if it was a per-task or gather timeout based on the task's elapsed time
-             if (
-                 elapsed_time
-                 >= settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-             ):
+             if elapsed_time >= _per_task_timeout():
                  timeout_type = "per-task"
              else:
                  timeout_type = "gather"

              logger.warning(
-                 f"[deepeval] gather TIMEOUT after {GATHER_TIMEOUT_SECONDS}s; "
+                 f"[deepeval] gather TIMEOUT after {_gather_timeout()}s; "
                  f"pending={len(pending)} tasks. Timeout type: {timeout_type}. "
                  f"To give tasks more time, consider increasing "
                  f"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS for longer task completion time or "
@@ -1926,7 +1929,7 @@
                  elapsed_time,
                  meta,
              )
-             if loop.get_debug() and settings.DEEPEVAL_DEBUG_ASYNC:
+             if loop.get_debug() and get_settings().DEEPEVAL_DEBUG_ASYNC:
                  frames = t.get_stack(limit=6)
                  if frames:
                      logger.info(" stack:")
@@ -1965,7 +1968,7 @@
          if not leftovers:
              return

-         if settings.DEEPEVAL_DEBUG_ASYNC:
+         if get_settings().DEEPEVAL_DEBUG_ASYNC:
              logger.warning(
                  "[deepeval] %d stray task(s) not tracked; cancelling...",
                  len(leftovers),
@@ -1985,7 +1988,7 @@
              )
          except RuntimeError:
              # If the loop is closing here, just continue
-             if settings.DEEPEVAL_DEBUG_ASYNC:
+             if get_settings().DEEPEVAL_DEBUG_ASYNC:
                  logger.warning(
                      "[deepeval] failed to drain stray tasks because loop is closing"
                  )
deepeval/metrics/g_eval/g_eval.py CHANGED
@@ -1,5 +1,7 @@
  """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""

+ import asyncio
+
  from typing import Optional, List, Tuple, Union, Type
  from deepeval.metrics import BaseMetric
  from deepeval.test_case import (
@@ -16,7 +18,7 @@ from deepeval.metrics.utils import (
  )
  from deepeval.models import DeepEvalBaseLLM
  from deepeval.metrics.indicator import metric_progress_indicator
- from deepeval.metrics.g_eval.schema import *
+ from deepeval.metrics.g_eval import schema as gschema
  from deepeval.metrics.g_eval.utils import (
      Rubric,
      construct_g_eval_params_string,
@@ -29,6 +31,7 @@ from deepeval.metrics.g_eval.utils import (
      number_evaluation_steps,
      get_score_range,
  )
+ from deepeval.config.settings import get_settings


  class GEval(BaseMetric):
@@ -81,12 +84,16 @@
          ):
              if self.async_mode:
                  loop = get_or_create_event_loop()
+                 coro = self.a_measure(
+                     test_case,
+                     _show_indicator=False,
+                     _in_component=_in_component,
+                     _additional_context=_additional_context,
+                 )
                  loop.run_until_complete(
-                     self.a_measure(
-                         test_case,
-                         _show_indicator=False,
-                         _in_component=_in_component,
-                         _additional_context=_additional_context,
+                     asyncio.wait_for(
+                         coro,
+                         timeout=get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
                      )
                  )
              else:
@@ -177,7 +184,9 @@
              return data["steps"]
          else:
              try:
-                 res: Steps = await self.model.a_generate(prompt, schema=Steps)
+                 res: gschema.Steps = await self.model.a_generate(
+                     prompt, schema=gschema.Steps
+                 )
                  return res.steps
              except TypeError:
                  res = await self.model.a_generate(prompt)
@@ -201,7 +210,9 @@
              return data["steps"]
          else:
              try:
-                 res: Steps = self.model.generate(prompt, schema=Steps)
+                 res: gschema.Steps = self.model.generate(
+                     prompt, schema=gschema.Steps
+                 )
                  return res.steps
              except TypeError:
                  res = self.model.generate(prompt)
@@ -264,7 +275,7 @@
                          score, res
                      )
                      return weighted_summed_score, reason
-                 except:
+                 except (KeyError, AttributeError, TypeError, ValueError):
                      return score, reason
          except (
              AttributeError
@@ -276,8 +287,8 @@
              return data["score"], data["reason"]
          else:
              try:
-                 res: ReasonScore = await self.model.a_generate(
-                     prompt, schema=ReasonScore
+                 res: gschema.ReasonScore = await self.model.a_generate(
+                     prompt, schema=gschema.ReasonScore
                  )
                  return res.score, res.reason
              except TypeError:
@@ -338,7 +349,7 @@
                          score, res
                      )
                      return weighted_summed_score, reason
-                 except:
+                 except (KeyError, AttributeError, TypeError, ValueError):
                      return score, reason
          except AttributeError:
              # This catches the case where a_generate_raw_response doesn't exist.
@@ -349,8 +360,8 @@
              return data["score"], data["reason"]
          else:
              try:
-                 res: ReasonScore = self.model.generate(
-                     prompt, schema=ReasonScore
+                 res: gschema.ReasonScore = self.model.generate(
+                     prompt, schema=gschema.ReasonScore
                  )
                  return res.score, res.reason
              except TypeError:
@@ -364,7 +375,7 @@
          else:
              try:
                  self.success = self.score >= self.threshold
-             except:
+             except TypeError:
                  self.success = False
          return self.success

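Note: the sync `measure()` path now bounds its async implementation with the per-task budget via `asyncio.wait_for`. The pattern in isolation (self-contained sketch, not deepeval API):

    import asyncio


    async def slow_work() -> str:
        await asyncio.sleep(0.2)
        return "done"


    loop = asyncio.new_event_loop()
    try:
        print(loop.run_until_complete(asyncio.wait_for(slow_work(), timeout=1.0)))
    except asyncio.TimeoutError:
        print("per-task budget exceeded")  # what a timed-out measure() surfaces
    finally:
        loop.close()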
deepeval/metrics/prompt_alignment/prompt_alignment.py CHANGED
@@ -1,3 +1,5 @@
+ import asyncio
+
  from typing import Optional, List, Union

  from deepeval.utils import get_or_create_event_loop, prettify_list
@@ -15,7 +17,8 @@ from deepeval.metrics import BaseMetric
  from deepeval.models import DeepEvalBaseLLM
  from deepeval.metrics.prompt_alignment.template import PromptAlignmentTemplate
  from deepeval.metrics.indicator import metric_progress_indicator
- from deepeval.metrics.prompt_alignment.schema import *
+ from deepeval.metrics.prompt_alignment import schema as paschema
+ from deepeval.config.settings import get_settings


  class PromptAlignmentMetric(BaseMetric):
@@ -62,15 +65,19 @@ class PromptAlignmentMetric(BaseMetric):
          ):
              if self.async_mode:
                  loop = get_or_create_event_loop()
+                 coro = self.a_measure(
+                     test_case,
+                     _show_indicator=False,
+                     _in_component=_in_component,
+                 )
                  loop.run_until_complete(
-                     self.a_measure(
-                         test_case,
-                         _show_indicator=False,
-                         _in_component=_in_component,
+                     asyncio.wait_for(
+                         coro,
+                         timeout=get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
                      )
                  )
              else:
-                 self.verdicts: Verdicts = self._generate_verdicts(
+                 self.verdicts: paschema.Verdicts = self._generate_verdicts(
                      test_case.input, test_case.actual_output
                  )
                  self.score = self._calculate_score()
@@ -105,7 +112,7 @@ class PromptAlignmentMetric(BaseMetric):
              _show_indicator=_show_indicator,
              _in_component=_in_component,
          ):
-             self.verdicts: Verdicts = await self._a_generate_verdicts(
+             self.verdicts: paschema.Verdicts = await self._a_generate_verdicts(
                  test_case.input, test_case.actual_output
              )
              self.score = self._calculate_score()
@@ -141,14 +148,17 @@ class PromptAlignmentMetric(BaseMetric):
          )
          if self.using_native_model:
              res, cost = await self.model.a_generate(
-                 prompt, schema=PromptAlignmentScoreReason
+                 prompt, schema=paschema.PromptAlignmentScoreReason
              )
              self.evaluation_cost += cost
              return res.reason
          else:
              try:
-                 res: PromptAlignmentScoreReason = await self.model.a_generate(
-                     prompt=prompt, schema=PromptAlignmentScoreReason
+                 res: paschema.PromptAlignmentScoreReason = (
+                     await self.model.a_generate(
+                         prompt=prompt,
+                         schema=paschema.PromptAlignmentScoreReason,
+                     )
                  )
                  return res.reason
              except TypeError:
@@ -173,14 +183,14 @@ class PromptAlignmentMetric(BaseMetric):
          )
          if self.using_native_model:
              res, cost = self.model.generate(
-                 prompt, schema=PromptAlignmentScoreReason
+                 prompt, schema=paschema.PromptAlignmentScoreReason
              )
              self.evaluation_cost += cost
              return res.reason
          else:
              try:
-                 res: PromptAlignmentScoreReason = self.model.generate(
-                     prompt=prompt, schema=PromptAlignmentScoreReason
+                 res: paschema.PromptAlignmentScoreReason = self.model.generate(
+                     prompt=prompt, schema=paschema.PromptAlignmentScoreReason
                  )
                  return res.reason
              except TypeError:
@@ -190,48 +200,56 @@ class PromptAlignmentMetric(BaseMetric):

      async def _a_generate_verdicts(
          self, input: str, actual_output: str
-     ) -> Verdicts:
+     ) -> paschema.Verdicts:
          prompt = PromptAlignmentTemplate.generate_verdicts(
              prompt_instructions=self.prompt_instructions,
              input=input,
              actual_output=actual_output,
          )
          if self.using_native_model:
-             res, cost = await self.model.a_generate(prompt, schema=Verdicts)
+             res, cost = await self.model.a_generate(
+                 prompt, schema=paschema.Verdicts
+             )
              self.evaluation_cost += cost
              return [item for item in res.verdicts]
          else:
              try:
-                 res: Verdicts = await self.model.a_generate(
-                     prompt, schema=Verdicts
+                 res: paschema.Verdicts = await self.model.a_generate(
+                     prompt, schema=paschema.Verdicts
                  )
                  return [item for item in res.verdicts]
              except TypeError:
                  res = await self.model.a_generate(prompt)
                  data = trimAndLoadJson(res, self)
                  return [
-                     PromptAlignmentVerdict(**item) for item in data["verdicts"]
+                     paschema.PromptAlignmentVerdict(**item)
+                     for item in data["verdicts"]
                  ]

-     def _generate_verdicts(self, input: str, actual_output: str) -> Verdicts:
+     def _generate_verdicts(
+         self, input: str, actual_output: str
+     ) -> paschema.Verdicts:
          prompt = PromptAlignmentTemplate.generate_verdicts(
              prompt_instructions=self.prompt_instructions,
              input=input,
              actual_output=actual_output,
          )
          if self.using_native_model:
-             res, cost = self.model.generate(prompt, schema=Verdicts)
+             res, cost = self.model.generate(prompt, schema=paschema.Verdicts)
              self.evaluation_cost += cost
              return [item for item in res.verdicts]
          else:
              try:
-                 res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                 res: paschema.Verdicts = self.model.generate(
+                     prompt, schema=paschema.Verdicts
+                 )
                  return [item for item in res.verdicts]
              except TypeError:
                  res = self.model.generate(prompt)
                  data = trimAndLoadJson(res, self)
                  return [
-                     PromptAlignmentVerdict(**item) for item in data["verdicts"]
+                     paschema.PromptAlignmentVerdict(**item)
+                     for item in data["verdicts"]
                  ]

      def _calculate_score(self):
@@ -253,7 +271,7 @@ class PromptAlignmentMetric(BaseMetric):
          else:
              try:
                  self.success = self.score >= self.threshold
-             except:
+             except TypeError:
                  self.success = False
          return self.success

deepeval/models/retry_policy.py CHANGED
@@ -33,9 +33,13 @@ Retry logging (settings; read at call time):

  from __future__ import annotations

+ import asyncio
+ import inspect
+ import itertools
+ import functools
+ import threading
  import logging

- from deepeval.utils import read_env_int, read_env_float
  from dataclasses import dataclass, field
  from typing import Callable, Iterable, Mapping, Optional, Sequence, Tuple, Union
  from collections.abc import Mapping as ABCMapping
@@ -58,6 +62,9 @@ from deepeval.config.settings import get_settings

  logger = logging.getLogger(__name__)
  Provider = Union[str, PS]
+ _MAX_TIMEOUT_THREADS = get_settings().DEEPEVAL_TIMEOUT_THREAD_LIMIT
+ _TIMEOUT_SEMA = threading.BoundedSemaphore(_MAX_TIMEOUT_THREADS)
+ _WORKER_ID = itertools.count(1)

  # --------------------------
  # Policy description
@@ -184,6 +191,12 @@ def extract_error_code(
  # Predicate factory
  # --------------------------

+ _BUILTIN_TIMEOUT_EXCS = (
+     (TimeoutError,)
+     if asyncio.TimeoutError is TimeoutError
+     else (TimeoutError, asyncio.TimeoutError)
+ )
+

  def make_is_transient(
      policy: ErrorPolicy,
@@ -213,6 +226,9 @@
      )

      def _pred(e: Exception) -> bool:
+         if isinstance(e, _BUILTIN_TIMEOUT_EXCS):
+             return True
+
          if isinstance(e, policy.auth_excs):
              return False

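Note: the `_BUILTIN_TIMEOUT_EXCS` short-circuit means a per-attempt `TimeoutError` is always classified as transient, before the auth/provider checks run. The classification in isolation:

    import asyncio

    # On Python 3.11+, asyncio.TimeoutError is an alias of TimeoutError;
    # on older versions both must be listed.
    _BUILTIN_TIMEOUT_EXCS = (
        (TimeoutError,)
        if asyncio.TimeoutError is TimeoutError
        else (TimeoutError, asyncio.TimeoutError)
    )


    def is_transient_timeout(exc: Exception) -> bool:
        # toy mirror of _pred's first check only
        return isinstance(exc, _BUILTIN_TIMEOUT_EXCS)


    print(is_transient_timeout(TimeoutError("per-attempt timeout")))  # True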
@@ -245,18 +261,23 @@

  class StopFromEnv(stop_base):
      def __call__(self, retry_state):
-         attempts = read_env_int("DEEPEVAL_RETRY_MAX_ATTEMPTS", 2, min_value=1)
+         settings = get_settings()
+         attempts = (
+             settings.DEEPEVAL_RETRY_MAX_ATTEMPTS
+         )  # bounds enforced by conint(ge=1) in Settings
          return stop_after_attempt(attempts)(retry_state)


  class WaitFromEnv(wait_base):
      def __call__(self, retry_state):
-         initial = read_env_float(
-             "DEEPEVAL_RETRY_INITIAL_SECONDS", 1.0, min_value=0.0
-         )
-         exp_base = read_env_float("DEEPEVAL_RETRY_EXP_BASE", 2.0, min_value=1.0)
-         jitter = read_env_float("DEEPEVAL_RETRY_JITTER", 2.0, min_value=0.0)
-         cap = read_env_float("DEEPEVAL_RETRY_CAP_SECONDS", 5.0, min_value=0.0)
+         settings = get_settings()
+         initial = settings.DEEPEVAL_RETRY_INITIAL_SECONDS
+         exp_base = settings.DEEPEVAL_RETRY_EXP_BASE
+         jitter = settings.DEEPEVAL_RETRY_JITTER
+         cap = settings.DEEPEVAL_RETRY_CAP_SECONDS
+
+         if cap == 0:  # <- 0 means no backoff sleeps or jitter
+             return 0
          return wait_exponential_jitter(
              initial=initial, exp_base=exp_base, jitter=jitter, max=cap
          )(retry_state)
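
Note: since `WaitFromEnv` short-circuits when the cap is zero, test suites can make retries immediate through the environment (set before Settings is constructed); a sketch:

    import os

    # No backoff sleeps and no jitter between retry attempts.
    os.environ["DEEPEVAL_RETRY_CAP_SECONDS"] = "0"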
@@ -324,10 +345,11 @@ def dynamic_retry(provider: Provider):

  def _retry_log_levels():
      s = get_settings()
+     base_level = s.LOG_LEVEL if s.LOG_LEVEL is not None else logging.INFO
      before_level = s.DEEPEVAL_RETRY_BEFORE_LOG_LEVEL
      after_level = s.DEEPEVAL_RETRY_AFTER_LOG_LEVEL
      return (
-         before_level if before_level is not None else logging.INFO,
+         before_level if before_level is not None else base_level,
          after_level if after_level is not None else logging.ERROR,
      )
@@ -394,21 +416,190 @@ def make_after_log(slug: str):
      return _after


+ def _make_timeout_error(timeout_seconds: float) -> TimeoutError:
+     settings = get_settings()
+     if logger.isEnabledFor(logging.DEBUG):
+         logger.debug(
+             "retry config: per_attempt=%s s, max_attempts=%s, per_task_budget=%s s",
+             timeout_seconds,
+             settings.DEEPEVAL_RETRY_MAX_ATTEMPTS,
+             settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+         )
+     msg = (
+         f"call timed out after {timeout_seconds:g}s (per attempt). "
+         "Increase DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS (0 disables) or reduce work per attempt."
+     )
+     return TimeoutError(msg)
+
+
+ def _run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
+     """
+     Run a synchronous callable with a soft timeout enforced by a helper thread,
+     with a global cap on concurrent timeout workers.
+
+     How it works
+     ------------
+     - A module-level BoundedSemaphore (size = settings.DEEPEVAL_TIMEOUT_THREAD_LIMIT)
+       gates creation of timeout worker threads. If no permit is available, this call
+       blocks until a slot frees up. If settings.DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS
+       > 0 and acquisition takes longer than that, a warning is logged before continuing
+       to wait.
+     - Once a permit is acquired, a daemon thread executes `func(*args, **kwargs)`.
+     - We wait up to `timeout_seconds` for completion. If the timeout elapses, we raise
+       `TimeoutError`. The worker thread is not killed; it continues and releases the
+       semaphore when it eventually finishes.
+     - If the worker finishes in time, we return its result or re-raise its exception
+       (with the original traceback).
+
+     Cancellation semantics
+     ----------------------
+     This is a soft timeout: Python threads cannot be forcibly terminated. When timeouts
+     are rare this is fine. If timeouts are common, consider moving to:
+       - a shared ThreadPoolExecutor (caps threads and amortizes creation), or
+       - worker processes (which support killing in-flight work)
+
+     Concurrency control & logging
+     -----------------------------
+     - Concurrency is bounded by `DEEPEVAL_TIMEOUT_THREAD_LIMIT`.
+     - If acquisition exceeds `DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS`, we log a
+       warning and then block until a slot is available.
+     - On timeout, if DEBUG is enabled and `DEEPEVAL_VERBOSE_MODE` is True, we log a short
+       thread sample to help diagnose pressure.
+
+     Args:
+         func: Synchronous callable to execute.
+         timeout_seconds: Float seconds for the soft timeout (0/None disables).
+         *args, **kwargs: Passed through to `func`.
+
+     Returns:
+         Whatever `func` returns.
+
+     Raises:
+         TimeoutError: If `timeout_seconds` elapse before completion.
+         BaseException: If `func` raises, the same exception is re-raised with its
+             original traceback.
+     """
+     if not timeout_seconds or timeout_seconds <= 0:
+         return func(*args, **kwargs)
+
+     # try to respect the global cap on concurrent timeout workers
+     warn_after = float(
+         get_settings().DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS or 0.0
+     )
+     if warn_after > 0:
+         acquired = _TIMEOUT_SEMA.acquire(timeout=warn_after)
+         if not acquired:
+             logger.warning(
+                 "timeout thread limit reached (%d); waiting for a slot...",
+                 _MAX_TIMEOUT_THREADS,
+             )
+             _TIMEOUT_SEMA.acquire()
+     else:
+         _TIMEOUT_SEMA.acquire()
+
+     done = threading.Event()
+     result = {"value": None, "exc": None}
+
+     def target():
+         try:
+             result["value"] = func(*args, **kwargs)
+         except BaseException as e:
+             result["exc"] = e
+         finally:
+             done.set()
+             _TIMEOUT_SEMA.release()
+
+     t = threading.Thread(
+         target=target,
+         daemon=True,
+         name=f"deepeval-timeout-worker-{next(_WORKER_ID)}",
+     )
+
+     try:
+         t.start()
+     except BaseException:
+         _TIMEOUT_SEMA.release()
+         raise
+
+     finished = done.wait(timeout_seconds)
+     if not finished:
+         if (
+             logger.isEnabledFor(logging.DEBUG)
+             and get_settings().DEEPEVAL_VERBOSE_MODE
+         ):
+             names = [th.name for th in threading.enumerate()[:10]]
+             logger.debug(
+                 "timeout after %.3fs (active_threads=%d, sample=%s)",
+                 timeout_seconds,
+                 threading.active_count(),
+                 names,
+             )
+         raise _make_timeout_error(timeout_seconds)
+
+     # Completed within time: return or raise
+     if result["exc"] is not None:
+         exc = result["exc"]
+         raise exc.with_traceback(getattr(exc, "__traceback__", None))
+     return result["value"]
+
+
  def create_retry_decorator(provider: Provider):
      """
      Build a Tenacity @retry decorator wired to our dynamic retry policy
      for the given provider slug.
      """
      slug = slugify(provider)
-
-     return retry(
+     base_retry = retry(
          wait=dynamic_wait(),
          stop=dynamic_stop(),
          retry=dynamic_retry(slug),
          before_sleep=make_before_sleep_log(slug),
          after=make_after_log(slug),
+         reraise=False,
      )

+     def _decorator(func):
+         if inspect.iscoroutinefunction(func):
+
+             @functools.wraps(func)
+             async def attempt(*args, **kwargs):
+                 timeout_seconds = (
+                     get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
+                 )
+                 coro = func(*args, **kwargs)
+                 if timeout_seconds > 0:
+                     try:
+                         return await asyncio.wait_for(coro, timeout_seconds)
+                     except asyncio.TimeoutError as e:
+                         if (
+                             logger.isEnabledFor(logging.DEBUG)
+                             and get_settings().DEEPEVAL_VERBOSE_MODE is True
+                         ):
+                             logger.debug(
+                                 "async timeout after %.3fs (active_threads=%d, tasks=%d)",
+                                 timeout_seconds,
+                                 threading.active_count(),
+                                 len(asyncio.all_tasks()),
+                             )
+                         raise _make_timeout_error(timeout_seconds) from e
+                 return await coro
+
+             return base_retry(attempt)
+
+         @functools.wraps(func)
+         def attempt(*args, **kwargs):
+             timeout_seconds = (
+                 get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
+             )
+             if timeout_seconds > 0:
+                 return _run_sync_with_timeout(
+                     func, timeout_seconds, *args, **kwargs
+                 )
+             return func(*args, **kwargs)
+
+         return base_retry(attempt)
+
+     return _decorator


  def _httpx_net_excs() -> tuple[type, ...]:
      try:
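
Note: a hedged usage sketch of the extended factory (the provider slug is illustrative; pass whatever slug your policy mapping supports):

    from deepeval.models.retry_policy import create_retry_decorator

    retry_provider = create_retry_decorator("openai")  # slug illustrative


    @retry_provider
    def call_provider() -> str:
        # Each attempt is bounded by DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS (if > 0).
        # A timed-out attempt raises TimeoutError, which the policy treats as transient.
        return "ok"

Async callables get the same treatment via `asyncio.wait_for`; sync callables run on a capped pool of daemon timeout-worker threads.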
deepeval/tracing/otel/exporter.py CHANGED
@@ -90,12 +90,6 @@ class ConfidentSpanExporter(SpanExporter):
          api_key: Optional[str] = None,  # dynamic api key,
          _test_run_id: Optional[str] = None,
      ) -> SpanExportResult:
-         # build forest of spans
-         # for span in spans:
-         #     print("--------------------------------")
-         #     print(span.to_json())
-         #     print("--------------------------------")
-         # return SpanExportResult.SUCCESS

          ################ Build Forest of Spans ################
          forest = self._build_span_forest(spans)
deepeval/tracing/otel/utils.py CHANGED
@@ -109,8 +109,24 @@ def check_llm_input_from_gen_ai_attributes(
      input = None
      output = None
      try:
-         input = json.loads(span.attributes.get("gen_ai.input.messages"))
-         input = _flatten_input(input)
+         # check for system instructions
+         system_instructions = []
+         system_instructions_raw = span.attributes.get(
+             "gen_ai.system_instructions"
+         )
+         if system_instructions_raw and isinstance(system_instructions_raw, str):
+             system_instructions_json = json.loads(system_instructions_raw)
+             system_instructions = _flatten_system_instructions(
+                 system_instructions_json
+             )
+
+         input_messages = []
+         input_messages_raw = span.attributes.get("gen_ai.input.messages")
+         if input_messages_raw and isinstance(input_messages_raw, str):
+             input_messages_json = json.loads(input_messages_raw)
+             input_messages = _flatten_input(input_messages_json)
+
+         input = system_instructions + input_messages

      except Exception:
          pass
@@ -137,6 +153,20 @@
      return input, output


+ def _flatten_system_instructions(system_instructions: list) -> list:
+     if isinstance(system_instructions, list):
+         for system_instruction in system_instructions:
+             if isinstance(system_instruction, dict):
+                 role = system_instruction.get("role")
+                 if not role:
+                     system_instruction["role"] = "System Instruction"
+         return _flatten_input(system_instructions)
+     elif isinstance(system_instructions, str):
+         return [{"role": "System Instruction", "content": system_instructions}]
+
+     return []
+
+
  def _flatten_input(input: list) -> list:
      if input and isinstance(input, list):
          try:
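
Note: `_flatten_system_instructions` accepts either a JSON list of message dicts (defaulting a missing role) or a bare string. A self-contained toy mirror of the two branches:

    def flatten_system_instructions(si):
        # toy mirror; the real helper also runs lists through _flatten_input
        if isinstance(si, list):
            for item in si:
                if isinstance(item, dict) and not item.get("role"):
                    item["role"] = "System Instruction"
            return si
        if isinstance(si, str):
            return [{"role": "System Instruction", "content": si}]
        return []


    print(flatten_system_instructions("be terse"))
    # [{'role': 'System Instruction', 'content': 'be terse'}]
    print(flatten_system_instructions([{"content": "be terse"}]))
    # [{'content': 'be terse', 'role': 'System Instruction'}]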
@@ -411,10 +441,23 @@ def _normalize_pydantic_ai_messages(span: ReadableSpan) -> Optional[list]:
      return None


+ def _extract_non_thinking_part_of_last_message(message: dict) -> Optional[dict]:
+     if isinstance(message, dict) and message.get("role") == "assistant":
+         parts = message.get("parts")
+         if parts:
+             # Iterate from the last part
+             for part in reversed(parts):
+                 if isinstance(part, dict) and part.get("type") == "text":
+                     # Return a modified message with only the text content
+                     return {"role": "assistant", "content": part.get("content")}
+     return None
+
+
  def check_pydantic_ai_agent_input_output(
      span: ReadableSpan,
  ) -> Tuple[Optional[Any], Optional[Any]]:
-     input_val: Optional[Any] = None
+     input_val: list = []
      output_val: Optional[Any] = None

      # Get normalized messages once
@@ -445,14 +488,21 @@
          if span.attributes.get("confident.span.type") == "agent":
              output_val = span.attributes.get("final_result")
              if not output_val and normalized:
-                 # Extract the last message if no final_result is available
-                 output_val = normalized[-1]
+                 output_val = _extract_non_thinking_part_of_last_message(
+                     normalized[-1]
+                 )
      except Exception:
          pass

+     system_instructions = []
+     system_instruction_raw = span.attributes.get("gen_ai.system_instructions")
+     if system_instruction_raw and isinstance(system_instruction_raw, str):
+         system_instructions = _flatten_system_instructions(
+             json.loads(system_instruction_raw)
+         )
+
      input_val = _flatten_input(input_val)
-     output_val = _flatten_input(output_val)
-     return input_val, output_val
+     return system_instructions + input_val, output_val


  def check_tool_output(span: ReadableSpan):
deepeval/tracing/trace_test_manager.py ADDED
@@ -0,0 +1,19 @@
+ from typing import Optional, Dict, Any
+ import asyncio
+ from time import monotonic
+
+
+ class TraceTestingManager:
+     test_name: Optional[str] = None
+     test_dict: Optional[Dict[str, Any]] = None
+
+     async def wait_for_test_dict(
+         self, timeout: float = 10.0, poll_interval: float = 0.05
+     ) -> Dict[str, Any]:
+         deadline = monotonic() + timeout
+         while self.test_dict is None and monotonic() < deadline:
+             await asyncio.sleep(poll_interval)
+         return self.test_dict or {}
+
+
+ trace_testing_manager = TraceTestingManager()
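
Note: the manager polls until a test publishes its trace dict or the deadline passes; a usage sketch (test name illustrative):

    import asyncio

    from deepeval.tracing.trace_test_manager import trace_testing_manager


    async def main() -> None:
        trace_testing_manager.test_name = "my_trace_test"  # illustrative
        # ... run traced code that eventually sets trace_testing_manager.test_dict ...
        data = await trace_testing_manager.wait_for_test_dict(timeout=5.0)
        print(data)  # {} if nothing arrived before the deadline


    asyncio.run(main())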
deepeval/tracing/tracing.py CHANGED
@@ -53,7 +53,7 @@ from deepeval.utils import dataclass_to_dict
  from deepeval.tracing.context import current_span_context, current_trace_context
  from deepeval.tracing.types import TestCaseMetricPair
  from deepeval.tracing.api import PromptApi
- from tests.test_integrations.manager import trace_testing_manager
+ from deepeval.tracing.trace_test_manager import trace_testing_manager

  EVAL_DUMMY_SPAN_NAME = "evals_iterator"
deepeval-3.6.6.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: deepeval
- Version: 3.6.5
+ Version: 3.6.6
  Summary: The LLM Evaluation Framework
  Home-page: https://github.com/confident-ai/deepeval
  License: Apache-2.0
deepeval-3.6.6.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
- deepeval/__init__.py,sha256=6fsb813LD_jNhqR-xZnSdE5E-KsBbC3tc4oIg5ZMgTw,2115
- deepeval/_version.py,sha256=7XydZTr-OhyEmxjczbOo90U1nYQK6hBYF4GXri8UIcY,27
+ deepeval/__init__.py,sha256=IqShG98ALpA1gm_qL2Jq56AJoafAHpcUTSvpgH4HpZM,3062
+ deepeval/_version.py,sha256=yb70ATorTjss8Uu310wa6TkPe0yTadiC7Lxb0-KZxMA,27
  deepeval/annotation/__init__.py,sha256=ZFhUVNNuH_YgQSZJ-m5E9iUb9TkAkEV33a6ouMDZ8EI,111
  deepeval/annotation/annotation.py,sha256=3j3-syeJepAcEj3u3e4T_BeRDzNr7yXGDIoNQGMKpwQ,2298
  deepeval/annotation/api.py,sha256=EYN33ACVzVxsFleRYm60KB4Exvff3rPJKt1VBuuX970,2147
@@ -141,7 +141,8 @@ deepeval/confident/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVG
  deepeval/confident/api.py,sha256=2ZhrQOtfxcnQSyY6OxrjY17y1yn-NB7pfIiJa20B1Pk,8519
  deepeval/confident/types.py,sha256=-slFhDof_1maMgpLxqDRZv6kz6ZVY2hP_0uj_aveJKU,533
  deepeval/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- deepeval/config/settings.py,sha256=EwCcYQYQDuayRVmRjYPtxUm9EHKLRu1eLaGfzNdaDEI,21827
+ deepeval/config/logging.py,sha256=ivqmhOSB-oHOOU3MvnhImrZwkkxzxKJgoKxesnWfHjg,1314
+ deepeval/config/settings.py,sha256=Ifw7HDSTaYCFk3zGHRf15uMCDZuy3NXAuNtWm4jcIUA,27575
  deepeval/config/settings_manager.py,sha256=enahSZN8krRu7-L94OBCt99fwUIqQtMRL97PlzsuKEY,4021
  deepeval/config/utils.py,sha256=gSOVv18Tx1R72GucbdQesbZLFL-Y9EzbS4p7qd2w_xE,3799
  deepeval/constants.py,sha256=Qe-es-WDPJndgBspEQXxddDCVanrAu03YWCpXsUkdo0,1368
@@ -159,7 +160,7 @@ deepeval/evaluate/api.py,sha256=rkblH0ZFAAdyuF0Ymh7JE1pIJPR9yFuPrn9SQaCEQp4,435
  deepeval/evaluate/compare.py,sha256=tdSJY4E7YJ_zO3dzvpwngZHLiUI2YQcTWJOLI83htsQ,9855
  deepeval/evaluate/configs.py,sha256=QfWjaWNxLsgEe8-5j4PIs5WcSyEckiWt0qdpXSpl57M,928
  deepeval/evaluate/evaluate.py,sha256=HoEERRLj8SVCcU1r70VQdSL4LQcSc9p20OhcD1nhEuQ,10594
- deepeval/evaluate/execute.py,sha256=M0o4dpUSkvXnzEK6QIgy-2pa0HQx6w6ZRbXoI03tJeI,88931
+ deepeval/evaluate/execute.py,sha256=vkiWaQGBAFbLIJ1tTYpGpu1YDpDSpH6o-oPftqPlNpM,88875
  deepeval/evaluate/types.py,sha256=zsL_lNbFMG20czzRQeWNDbLSzL8Uy7IIgvILe-X0kN0,918
  deepeval/evaluate/utils.py,sha256=oBJFcUDYmmsRvXW7rXkQy3gI1Tuu5bixgvHx0yvnw1c,23563
  deepeval/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -240,7 +241,7 @@ deepeval/metrics/faithfulness/faithfulness.py,sha256=KXI5VPcOsPZ2Pk1-69tR-kq4x-F
  deepeval/metrics/faithfulness/schema.py,sha256=2dU9dwwmqpGJcWvY2webERWIfH_tn02xgLghHkAY_eM,437
  deepeval/metrics/faithfulness/template.py,sha256=RuZ0LFm4BjZ8lhVrKPgU3ecHszwkF0fe5-BxAkaP5AA,5839
  deepeval/metrics/g_eval/__init__.py,sha256=HAhsQFVq9LIpZXPN00Jc_WrMXrh47NIT86VnUpWM4_4,102
- deepeval/metrics/g_eval/g_eval.py,sha256=CaW7VHPW-SyXt18IE1rSatgagY238s3It-j6SLRI4H4,14395
+ deepeval/metrics/g_eval/g_eval.py,sha256=zd4_M7UaT_l1GxHrA_g9nzCl5LXH-NYzKYLEWKLhpaU,14875
  deepeval/metrics/g_eval/schema.py,sha256=V629txuDrr_2IEKEsgJVYYZb_pkdfcltQV9ZjvxK5co,287
  deepeval/metrics/g_eval/template.py,sha256=mHj4-mr_HQwbCjpHg7lM_6UesoSatL3g8UGGQAOdT0U,4509
  deepeval/metrics/g_eval/utils.py,sha256=uUT86jRXVYvLDzcnZvvfWssDyGoBHb66nWcJSg4i1u4,8784
@@ -327,7 +328,7 @@ deepeval/metrics/pii_leakage/pii_leakage.py,sha256=EIQMS_hOiYhEW5x4nYJwS6AhWl9jh
  deepeval/metrics/pii_leakage/schema.py,sha256=Jk9jdf4HAa76J237mnosWOCV71pBBNdLfaVhf-4dKEg,313
  deepeval/metrics/pii_leakage/template.py,sha256=DEW21CyR2lEI1y2C_fXgZnGJlYw0fvnB-LF-HEKZnqo,2418
  deepeval/metrics/prompt_alignment/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- deepeval/metrics/prompt_alignment/prompt_alignment.py,sha256=vQQa-nNBYXsHcR6kyPW0Efcqu3jRoahh2ZM0Ft9Cq8Y,9481
+ deepeval/metrics/prompt_alignment/prompt_alignment.py,sha256=JW6AjOUvJssTTwOzggT5QPp4zAb5Z4e2H-xQS83Pv3M,10083
  deepeval/metrics/prompt_alignment/schema.py,sha256=ann3tH5XfZCLhePE2UXTSK_gkF-_udP0RP_IVHW4mF0,315
  deepeval/metrics/prompt_alignment/template.py,sha256=6-A1rWOZWOEauSsQAXC-gVF2oXkYfgV0XqiriGJJfY0,3182
  deepeval/metrics/ragas.py,sha256=I4EdwbsRGHlSEraKFBrDGUBrzRUBqlWqJAGg0efrZ0w,17637
@@ -386,7 +387,7 @@ deepeval/models/mlllms/__init__.py,sha256=19nN6kUB5XI0nUWUQX0aD9GBUMM8WWGvsDgKju
  deepeval/models/mlllms/gemini_model.py,sha256=7tHIWD4w_fBz3L7jkKWygn1QpBPk9nl2Kw-yb0Jc3PI,10167
  deepeval/models/mlllms/ollama_model.py,sha256=_YtYtw8oIMVVI-CFsDicsdeEJUPhw_9ArPxB_1olsJA,4798
  deepeval/models/mlllms/openai_model.py,sha256=KgvYgQwWZ1A_Gcl6-4-W7IMqbUF9K8sNY37j5Ag7kQQ,9014
- deepeval/models/retry_policy.py,sha256=U7DjJJcCPei14Ws_7U6-JK8ZZYCGVx9YgMPySYr0HtM,24545
+ deepeval/models/retry_policy.py,sha256=efMJwjQasAE_3fstfBBmxLOzxUxws26zHP5yvEBcPfQ,31458
  deepeval/models/summac_model.py,sha256=wKeH7pWQRXrTlzlIw_r1YCb8b7jUhWq6jUz9FiNUCSg,1992
  deepeval/models/unbias_model.py,sha256=umOMhQLTmnD7uOuhiQufEl4Wlti4q2s3EtKOpds7zhs,597
  deepeval/models/utils.py,sha256=-3XDgg1U7PZ0jpLFiYXxqdBhp7idvlo7RPZv5SoD8lc,1130
@@ -453,18 +454,19 @@ deepeval/tracing/offline_evals/span.py,sha256=pXqTVXs-WnjRVpCYYEbNe0zSM6Wz9GsKHs
  deepeval/tracing/offline_evals/thread.py,sha256=bcSGFcZJKnszArOLIlWvnCyt0zSmsd7Xsw5rl4RTVFg,1981
  deepeval/tracing/offline_evals/trace.py,sha256=vTflaTKysKRiYvKA-Nx6PUJ3J6NrRLXiIdWieVcm90E,1868
  deepeval/tracing/otel/__init__.py,sha256=HQsaF5yLPwyW5qg8AOV81_nG_7pFHnatOTHi9Wx3HEk,88
- deepeval/tracing/otel/exporter.py,sha256=wPO1ITKpjueLOSNLO6nD2QL9LAd8Xcu6en8hRkB61Wo,28891
+ deepeval/tracing/otel/exporter.py,sha256=eykoPzrRn3ljVO_JKzUeXThZ5iApyImGCDgnimIoUXs,28640
  deepeval/tracing/otel/test_exporter.py,sha256=bezihPGWJpwUEF3ZghxqhhorocVFTO2b43jRM-JHYMU,1076
- deepeval/tracing/otel/utils.py,sha256=yAXyPvTjax2HdLcvbVv9pyOVW4S7elIp3RLGuBTr_8o,15113
+ deepeval/tracing/otel/utils.py,sha256=Zw2_PcDm3Dtds1xUZWiVwVIKd4N98bVC8OZ_pkXv7qY,17190
  deepeval/tracing/patchers.py,sha256=DAPNkhrDtoeyJIVeQDUMhTz-xGcXu00eqjQZmov8FiU,3096
  deepeval/tracing/perf_epoch_bridge.py,sha256=iyAPddB6Op7NpMtPHJ29lDm53Btz9yLaN6xSCfTRQm4,1825
  deepeval/tracing/trace_context.py,sha256=jmOH3oBKz1FeNz_J61CUfkuQ3SpyFc6n7mo_HVX6JfU,352
- deepeval/tracing/tracing.py,sha256=EhpZnKjYNCr_K5dTr9gqtK5uzKzhvE-lrk_t8OF5EOE,42903
+ deepeval/tracing/trace_test_manager.py,sha256=wt4y7EWTRc4Bw938-UFFtXHkdFFOrnx6JaIk7J5Iulw,555
+ deepeval/tracing/tracing.py,sha256=-9GE0fjtv5xKfGZHT9LLEt-38NbqkgXRp1uZ0U1W158,42907
  deepeval/tracing/types.py,sha256=l_utWKerNlE5H3mOKpeUJLsvpP3cMyjH7HRANNgTmSQ,5306
  deepeval/tracing/utils.py,sha256=6SXJ7JJu-6OUziFZ_1IJppuVv7Rlq4cw3c3B7DL_eRQ,5295
  deepeval/utils.py,sha256=J1JNzjAlmn-UsFTK8-c5bhUuk5crwFGVCrRvle-nNmA,21533
- deepeval-3.6.5.dist-info/LICENSE.md,sha256=0ATkuLv6QgsJTBODUHC5Rak_PArA6gv2t7inJzNTP38,11352
- deepeval-3.6.5.dist-info/METADATA,sha256=jVsdK4BG21hV9kceAgzk5Ug34I0d1T7s-R5BIKGiHiQ,18754
- deepeval-3.6.5.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
- deepeval-3.6.5.dist-info/entry_points.txt,sha256=fVr8UphXTfJe9I2rObmUtfU3gkSrYeM0pLy-NbJYg10,94
- deepeval-3.6.5.dist-info/RECORD,,
+ deepeval-3.6.6.dist-info/LICENSE.md,sha256=0ATkuLv6QgsJTBODUHC5Rak_PArA6gv2t7inJzNTP38,11352
+ deepeval-3.6.6.dist-info/METADATA,sha256=n1o8egypf7Pr3YqgzhZ5ZnO7uSaZrWy3i5x4lyKoHmA,18754
+ deepeval-3.6.6.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+ deepeval-3.6.6.dist-info/entry_points.txt,sha256=fVr8UphXTfJe9I2rObmUtfU3gkSrYeM0pLy-NbJYg10,94
+ deepeval-3.6.6.dist-info/RECORD,,