PyPI - deepeval - Versions diffs - 3.6.4__py3-none-any.whl → 3.6.6__py3-none-any.whl - Mend

deepeval 3.6.4__py3-none-any.whl → 3.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

deepeval/__init__.py +42 -10
deepeval/_version.py +1 -1
deepeval/config/logging.py +33 -0
deepeval/config/settings.py +167 -12
deepeval/dataset/dataset.py +8 -2
deepeval/evaluate/evaluate.py +8 -2
deepeval/evaluate/execute.py +28 -30
deepeval/evaluate/types.py +4 -1
deepeval/evaluate/utils.py +46 -29
deepeval/integrations/crewai/__init__.py +1 -2
deepeval/integrations/crewai/handler.py +153 -81
deepeval/integrations/crewai/wrapper.py +87 -0
deepeval/integrations/pydantic_ai/instrumentator.py +48 -9
deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
deepeval/metrics/faithfulness/faithfulness.py +8 -0
deepeval/metrics/g_eval/g_eval.py +26 -15
deepeval/metrics/prompt_alignment/prompt_alignment.py +41 -23
deepeval/models/retry_policy.py +202 -11
deepeval/test_run/__init__.py +2 -1
deepeval/test_run/api.py +1 -0
deepeval/test_run/test_run.py +85 -9
deepeval/tracing/__init__.py +2 -0
deepeval/tracing/otel/exporter.py +0 -6
deepeval/tracing/otel/test_exporter.py +35 -0
deepeval/tracing/otel/utils.py +57 -7
deepeval/tracing/trace_context.py +14 -0
deepeval/tracing/trace_test_manager.py +19 -0
deepeval/tracing/tracing.py +7 -6
deepeval/tracing/utils.py +2 -86
deepeval/utils.py +149 -1
{deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/METADATA +1 -1
{deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/RECORD +35 -31
deepeval/integrations/crewai/agent.py +0 -98
deepeval/integrations/crewai/patch.py +0 -41
{deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/LICENSE.md +0 -0
{deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/WHEEL +0 -0
{deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/entry_points.txt +0 -0

deepeval/__init__.py CHANGED Viewed

@@ -1,24 +1,56 @@
+from __future__ import annotations
+import logging
 import os
-import warnings
 import re
+import warnings
-# load environment variables before other imports
+# IMPORTANT: load environment variables before other imports
 from deepeval.config.settings import autoload_dotenv, get_settings
+logging.getLogger("deepeval").addHandler(logging.NullHandler())
 autoload_dotenv()
-from ._version import __version__
-from deepeval.evaluate import evaluate, assert_test
-from deepeval.evaluate.compare import compare
-from deepeval.test_run import on_test_run_end, log_hyperparameters
-from deepeval.utils import login
-from deepeval.telemetry import *
+def _expose_public_api() -> None:
+    # All other imports must happen after env is loaded
+    # Do not do this at module level or ruff will complain with E402
+    global __version__, evaluate, assert_test, compare
+    global on_test_run_end, log_hyperparameters, login, telemetry
+    from ._version import __version__ as _version
+    from deepeval.evaluate import (
+        evaluate as _evaluate,
+        assert_test as _assert_test,
+    )
+    from deepeval.evaluate.compare import compare as _compare
+    from deepeval.test_run import (
+        on_test_run_end as _on_end,
+        log_hyperparameters as _log_hparams,
+    )
+    from deepeval.utils import login as _login
+    import deepeval.telemetry as _telemetry
+    __version__ = _version
+    evaluate = _evaluate
+    assert_test = _assert_test
+    compare = _compare
+    on_test_run_end = _on_end
+    log_hyperparameters = _log_hparams
+    login = _login
+    telemetry = _telemetry
+_expose_public_api()
 settings = get_settings()
 if not settings.DEEPEVAL_GRPC_LOGGING:
-    os.environ.setdefault("GRPC_VERBOSITY", "ERROR")
-    os.environ.setdefault("GRPC_TRACE", "")
+    if os.getenv("GRPC_VERBOSITY") is None:
+        os.environ["GRPC_VERBOSITY"] = settings.GRPC_VERBOSITY or "ERROR"
+    if os.getenv("GRPC_TRACE") is None:
+        os.environ["GRPC_TRACE"] = settings.GRPC_TRACE or ""
 __all__ = [

deepeval/_version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__: str = "3.6.4"
1	+ __version__: str = "3.6.6"

deepeval/config/logging.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""
+Minimal logging configuration helpers for DeepEval.
+This module centralizes how the library-level logger ("deepeval") is configured. We
+intentionally keep configuration lightweight so application code retains control
+over handlers and formatters.
+"""
+import logging
+from deepeval.config.settings import get_settings
+def apply_deepeval_log_level() -> None:
+    """
+    Apply DeepEval's current log level to the package logger.
+    This function reads `LOG_LEVEL` from `deepeval.config.settings.get_settings()`
+    and sets the level of the `"deepeval"` logger accordingly. If `LOG_LEVEL` is
+    unset (None), INFO is used as a default. The logger's `propagate` flag is set
+    to True so records bubble up to the application's handlers. DeepEval does not
+    install its own handlers here (a NullHandler is attached in `__init__.py`).
+    The function is idempotent and safe to call multiple times. It is invoked
+    automatically when settings are first constructed and whenever `LOG_LEVEL`
+    is changed via `settings.edit`.
+    """
+    settings = get_settings()
+    log_level = settings.LOG_LEVEL
+    logging.getLogger("deepeval").setLevel(
+        log_level if log_level is not None else logging.INFO
+    )
+    # ensure we bubble up to app handlers
+    logging.getLogger("deepeval").propagate = True

deepeval/config/settings.py CHANGED Viewed

@@ -10,12 +10,20 @@ Central config for DeepEval.
 """
 import logging
+import math
 import os
 import re
 from dotenv import dotenv_values
 from pathlib import Path
-from pydantic import AnyUrl, SecretStr, field_validator, confloat
+from pydantic import (
+    AnyUrl,
+    computed_field,
+    confloat,
+    conint,
+    field_validator,
+    SecretStr,
+)
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from typing import Any, Dict, List, Optional, NamedTuple
@@ -155,7 +163,7 @@ class Settings(BaseSettings):
     #
     APP_ENV: str = "dev"
-    LOG_LEVEL: str = "info"
+    LOG_LEVEL: Optional[int] = None
     PYTHONPATH: str = "."
     CONFIDENT_REGION: Optional[str] = None
     CONFIDENT_OPEN_BROWSER: Optional[bool] = True
@@ -180,6 +188,19 @@ class Settings(BaseSettings):
     # into this directory. The directory will be created on demand.
     DEEPEVAL_RESULTS_FOLDER: Optional[Path] = None
+    # Display / Truncation
+    DEEPEVAL_MAXLEN_TINY: Optional[int] = 40
+    DEEPEVAL_MAXLEN_SHORT: Optional[int] = 60
+    DEEPEVAL_MAXLEN_MEDIUM: Optional[int] = 120
+    DEEPEVAL_MAXLEN_LONG: Optional[int] = 240
+    # If set, this overrides the default max_len used by deepeval/utils shorten
+    # falls back to DEEPEVAL_MAXLEN_LONG when None.
+    DEEPEVAL_SHORTEN_DEFAULT_MAXLEN: Optional[int] = None
+    # Optional global suffix (keeps your "..." default).
+    DEEPEVAL_SHORTEN_SUFFIX: Optional[str] = "..."
     #
     # GPU and perf toggles
     #
@@ -274,9 +295,33 @@ class Settings(BaseSettings):
     #
     # Retry Policy
     #
-    DEEPEVAL_SDK_RETRY_PROVIDERS: Optional[List[str]] = None
-    DEEPEVAL_RETRY_BEFORE_LOG_LEVEL: Optional[int] = None  # default -> INFO
+    # Controls how Tenacity retries provider calls when the SDK isn't doing its own retries.
+    # Key concepts:
+    # - attempts count includes the first call. e.g. 1 = no retries, 2 = one retry.
+    # - backoff sleeps follow exponential growth with a cap, plus jitter. Expected jitter
+    #   contribution is ~ JITTER/2 per sleep.
+    # - logging levels are looked up dynamically each attempt, so if you change LOG_LEVEL at runtime,
+    #   the retry loggers will honor it without restart.
+    DEEPEVAL_SDK_RETRY_PROVIDERS: Optional[List[str]] = (
+        None  # ["*"] to delegate all retries to SDKs
+    )
+    DEEPEVAL_RETRY_BEFORE_LOG_LEVEL: Optional[int] = (
+        None  # default is LOG_LEVEL if set, else INFO
+    )
     DEEPEVAL_RETRY_AFTER_LOG_LEVEL: Optional[int] = None  # default -> ERROR
+    DEEPEVAL_RETRY_MAX_ATTEMPTS: conint(ge=1) = (
+        2  # attempts = first try + retries
+    )
+    DEEPEVAL_RETRY_INITIAL_SECONDS: confloat(ge=0) = (
+        1.0  # first sleep before retry, if any
+    )
+    DEEPEVAL_RETRY_EXP_BASE: confloat(ge=1) = (
+        2.0  # exponential growth factor for sleeps
+    )
+    DEEPEVAL_RETRY_JITTER: confloat(ge=0) = 2.0  # uniform jitter
+    DEEPEVAL_RETRY_CAP_SECONDS: confloat(ge=0) = (
+        5.0  # cap for each backoff sleep
+    )
     #
     # Telemetry and Debug
@@ -303,19 +348,87 @@ class Settings(BaseSettings):
     #
     MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS: float = 3.05
     MEDIA_IMAGE_READ_TIMEOUT_SECONDS: float = 10.0
+    # DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS: per-attempt timeout for provider calls enforced by our retry decorator.
+    # This timeout interacts with retry policy and the task level budget (DEEPEVAL_PER_TASK_TIMEOUT_SECONDS) below.
+    # If you leave this at 0/None, the computed outer budget defaults to 180s.
+    DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS: Optional[confloat(ge=0)] = (
+        None  # per-attempt timeout. Set 0/None to disable
+    )
     #
     # Async Task Configuration
     #
-    # Maximum time allowed for a single task to complete
-    DEEPEVAL_PER_TASK_TIMEOUT_SECONDS: int = (
-        300  # Set to float('inf') to disable timeout
-    )
+    DEEPEVAL_TIMEOUT_THREAD_LIMIT: conint(ge=1) = 128
+    DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS: confloat(ge=0) = 5.0
+    # DEEPEVAL_PER_TASK_TIMEOUT_SECONDS is the outer time budget for one metric/task.
+    # It is computed from per-attempt timeout + retries/backoff unless you explicitly override it.
+    # - OVERRIDE = None or 0 -> auto compute as:
+    #     attempts * per_attempt_timeout + sum(backoff_sleeps) + ~jitter/2 per sleep + 1s safety
+    #   (If per_attempt_timeout is 0/None, the auto outer budget defaults to 180s.)
+    # - OVERRIDE > 0         -> use that exact value. A warning is logged if it is likely too small
+    #   to permit the configured attempts/backoff.
+    #
+    # Tip:
+    #   Most users only need to set DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS and DEEPEVAL_RETRY_MAX_ATTEMPTS.
+    #   Leave the outer budget on auto unless you have very strict SLAs.
+    DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE: Optional[conint(ge=0)] = None
     # Buffer time for gathering results from all tasks, added to the longest task duration
     # Increase if many tasks are running concurrently
-    DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: int = 60
+    DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: confloat(ge=0) = 60
+    ###################
+    # Computed Fields #
+    ###################
+    def _calc_auto_outer_timeout(self) -> int:
+        """Compute outer budget from per-attempt timeout + retries/backoff.
+        Never reference the computed property itself here.
+        """
+        attempts = self.DEEPEVAL_RETRY_MAX_ATTEMPTS or 1
+        timeout_seconds = float(self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
+        if timeout_seconds <= 0:
+            # No per-attempt timeout set -> default outer budget
+            return 180
+        sleeps = max(0, attempts - 1)
+        cur = float(self.DEEPEVAL_RETRY_INITIAL_SECONDS)
+        cap = float(self.DEEPEVAL_RETRY_CAP_SECONDS)
+        base = float(self.DEEPEVAL_RETRY_EXP_BASE)
+        jitter = float(self.DEEPEVAL_RETRY_JITTER)
+        backoff = 0.0
+        for _ in range(sleeps):
+            backoff += min(cap, cur)
+            cur *= base
+        backoff += sleeps * (jitter / 2.0)  # expected jitter
+        safety_overhead = 1.0
+        return int(
+            math.ceil(attempts * timeout_seconds + backoff + safety_overhead)
+        )
+    @computed_field
+    @property
+    def DEEPEVAL_PER_TASK_TIMEOUT_SECONDS(self) -> int:
+        """If OVERRIDE is set (nonzero), return it; else return the derived budget."""
+        outer = self.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE
+        if outer not in (None, 0):
+            # Warn if user-provided outer is likely to truncate retries
+            if (self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0) > 0:
+                min_needed = self._calc_auto_outer_timeout()
+                if int(outer) < min_needed:
+                    if self.DEEPEVAL_VERBOSE_MODE:
+                        logger.warning(
+                            "Metric timeout (outer=%ss) is less than attempts × per-attempt "
+                            "timeout + backoff (≈%ss). Retries may be cut short.",
+                            int(outer),
+                            min_needed,
+                        )
+            return int(outer)
+        # Auto mode
+        return self._calc_auto_outer_timeout()
     ##############
     # Validators #
@@ -461,7 +574,9 @@ class Settings(BaseSettings):
             if s in SUPPORTED_PROVIDER_SLUGS:
                 normalized.append(s)
             else:
-                if cls.DEEPEVAL_VERBOSE_MODE:
+                if parse_bool(
+                    os.getenv("DEEPEVAL_VERBOSE_MODE"), default=False
+                ):
                     logger.warning("Unknown provider slug %r dropped", item)
         if star:
@@ -474,6 +589,7 @@ class Settings(BaseSettings):
     @field_validator(
         "DEEPEVAL_RETRY_BEFORE_LOG_LEVEL",
         "DEEPEVAL_RETRY_AFTER_LOG_LEVEL",
+        "LOG_LEVEL",
         mode="before",
     )
     @classmethod
@@ -511,6 +627,10 @@ class Settings(BaseSettings):
     # Persistence support #
     #######################
     class _SettingsEditCtx:
+        COMPUTED_FIELDS: frozenset[str] = frozenset(
+            {"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS"}
+        )
         def __init__(
             self,
             settings: "Settings",
@@ -546,8 +666,11 @@ class Settings(BaseSettings):
             # lazy import legacy JSON store deps
             from deepeval.key_handler import KEY_FILE_HANDLER
+            model_fields = type(self._s).model_fields
+            # Exclude computed fields from persistence
             # compute diff of changed fields
-            after = {k: getattr(self._s, k) for k in type(self._s).model_fields}
+            after = {k: getattr(self._s, k) for k in model_fields}
             before_norm = {
                 k: _normalize_for_env(v) for k, v in self._before.items()
@@ -557,12 +680,21 @@ class Settings(BaseSettings):
             changed_keys = {
                 k for k in after_norm if after_norm[k] != before_norm.get(k)
             }
+            changed_keys -= self.COMPUTED_FIELDS
             if not changed_keys:
                 self.result = PersistResult(False, None, {})
                 return False
             updates = {k: after[k] for k in changed_keys}
+            if "LOG_LEVEL" in updates:
+                from deepeval.config.logging import (
+                    apply_deepeval_log_level,
+                )
+                apply_deepeval_log_level()
             #
             # .deepeval JSON support
             #
@@ -668,4 +800,27 @@ def get_settings() -> Settings:
     global _settings_singleton
     if _settings_singleton is None:
         _settings_singleton = Settings()
+        from deepeval.config.logging import apply_deepeval_log_level
+        apply_deepeval_log_level()
     return _settings_singleton
+def reset_settings(*, reload_dotenv: bool = False) -> Settings:
+    """
+    Drop the cached Settings singleton and rebuild it from the current process
+    environment.
+    Args:
+        reload_dotenv: When True, call `autoload_dotenv()` before re-instantiating,
+                       which merges .env values into os.environ (never overwriting
+                       existing process env vars).
+    Returns:
+        The fresh Settings instance.
+    """
+    global _settings_singleton
+    if reload_dotenv:
+        autoload_dotenv()
+    _settings_singleton = None
+    return get_settings()

deepeval/dataset/dataset.py CHANGED Viewed

@@ -1266,11 +1266,17 @@ class EvaluationDataset:
                 detach(ctx_token)
             else:
-                confident_link = global_test_run_manager.wrap_up_test_run(
+                res = global_test_run_manager.wrap_up_test_run(
                     run_duration, display_table=False
                 )
+                if isinstance(res, tuple):
+                    confident_link, test_run_id = res
+                else:
+                    confident_link = test_run_id = None
                 return EvaluationResult(
-                    test_results=test_results, confident_link=confident_link
+                    test_results=test_results,
+                    confident_link=confident_link,
+                    test_run_id=test_run_id,
                 )
     def evaluate(self, task: Task):

deepeval/evaluate/evaluate.py CHANGED Viewed

@@ -268,11 +268,17 @@ def evaluate(
         test_run = global_test_run_manager.get_test_run()
         test_run.hyperparameters = process_hyperparameters(hyperparameters)
         global_test_run_manager.save_test_run(TEMP_FILE_PATH)
-        confident_link = global_test_run_manager.wrap_up_test_run(
+        res = global_test_run_manager.wrap_up_test_run(
             run_duration, display_table=False
         )
+        if isinstance(res, tuple):
+            confident_link, test_run_id = res
+        else:
+            confident_link = test_run_id = None
         return EvaluationResult(
-            test_results=test_results, confident_link=confident_link
+            test_results=test_results,
+            confident_link=confident_link,
+            test_run_id=test_run_id,
         )
     elif metric_collection:
         api = Api()

deepeval/evaluate/execute.py CHANGED Viewed

@@ -45,9 +45,7 @@ from deepeval.dataset import Golden
 from deepeval.contextvars import set_current_golden, reset_current_golden
 from deepeval.errors import MissingTestCaseParamsError
 from deepeval.metrics.utils import copy_metrics
-from deepeval.utils import (
-    get_or_create_event_loop,
-)
+from deepeval.utils import get_or_create_event_loop, shorten, len_medium
 from deepeval.telemetry import capture_evaluation_run
 from deepeval.metrics import (
     BaseMetric,
@@ -93,7 +91,6 @@ from deepeval.config.settings import get_settings
 logger = logging.getLogger(__name__)
-settings = get_settings()
 async def _snapshot_tasks():
@@ -102,6 +99,18 @@ async def _snapshot_tasks():
     return {t for t in asyncio.all_tasks() if t is not cur}
+def _per_task_timeout() -> float:
+    return get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+def _gather_timeout() -> float:
+    s = get_settings()
+    return (
+        s.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+        + s.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
+    )
 ###########################################
 ### E2E Evals #############################
 ###########################################
@@ -840,7 +849,7 @@ def execute_agentic_test_cases(
                         loop.run_until_complete(
                             asyncio.wait_for(
                                 coro,
-                                timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+                                timeout=_per_task_timeout(),
                             )
                         )
                     else:
@@ -1198,7 +1207,7 @@ async def _a_execute_agentic_test_case(
             if asyncio.iscoroutinefunction(observed_callback):
                 await asyncio.wait_for(
                     observed_callback(golden.input),
-                    timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+                    timeout=_per_task_timeout(),
                 )
             else:
                 observed_callback(golden.input)
@@ -1755,11 +1764,6 @@ def a_execute_agentic_test_cases_from_loop(
     _is_assert_test: bool = False,
 ) -> Iterator[TestResult]:
-    GATHER_TIMEOUT_SECONDS = (
-        settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-        + settings.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
-    )
     semaphore = asyncio.Semaphore(async_config.max_concurrent)
     original_create_task = asyncio.create_task
@@ -1774,7 +1778,7 @@ def a_execute_agentic_test_cases_from_loop(
     async def execute_callback_with_semaphore(coroutine: Awaitable):
         async with semaphore:
             return await asyncio.wait_for(
-                coroutine, timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+                coroutine, timeout=_per_task_timeout()
             )
     def evaluate_test_cases(
@@ -1802,14 +1806,11 @@ def a_execute_agentic_test_cases_from_loop(
             )
             # record metadata for debugging
-            MAX_META_INPUT_LENGTH = 120
             started = time.perf_counter()
-            short_input = current_golden_ctx["input"]
-            if (
-                isinstance(short_input, str)
-                and len(short_input) > MAX_META_INPUT_LENGTH
-            ):
-                short_input = short_input[:MAX_META_INPUT_LENGTH] + "…"
+            short_input = current_golden_ctx.get("input")
+            if isinstance(short_input, str):
+                short_input = shorten(short_input, len_medium())
             task_meta[task] = {
                 "golden_index": current_golden_ctx["index"],
                 "golden_name": current_golden_ctx["name"],
@@ -1819,7 +1820,7 @@ def a_execute_agentic_test_cases_from_loop(
             }
             def on_task_done(t: asyncio.Task):
-                if settings.DEEPEVAL_DEBUG_ASYNC:
+                if get_settings().DEEPEVAL_DEBUG_ASYNC:
                     # Using info level here to make it easy to spot these logs.
                     # We are gated by DEEPEVAL_DEBUG_ASYNC
                     meta = task_meta.get(t, {})
@@ -1893,7 +1894,7 @@ def a_execute_agentic_test_cases_from_loop(
                 loop.run_until_complete(
                     asyncio.wait_for(
                         asyncio.gather(*created_tasks, return_exceptions=True),
-                        timeout=GATHER_TIMEOUT_SECONDS,
+                        timeout=_gather_timeout(),
                     )
                 )
             except asyncio.TimeoutError:
@@ -1908,16 +1909,13 @@ def a_execute_agentic_test_cases_from_loop(
                     elapsed_time = time.perf_counter() - start_time
                     # Determine if it was a per task or gather timeout based on task's elapsed time
-                    if (
-                        elapsed_time
-                        >= settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-                    ):
+                    if elapsed_time >= _per_task_timeout():
                         timeout_type = "per-task"
                     else:
                         timeout_type = "gather"
                     logger.warning(
-                        f"[deepeval] gather TIMEOUT after {GATHER_TIMEOUT_SECONDS}s; "
+                        f"[deepeval] gather TIMEOUT after {_gather_timeout()}s; "
                         f"pending={len(pending)} tasks. Timeout type: {timeout_type}. "
                         f"To give tasks more time, consider increasing "
                         f"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS for longer task completion time or "
@@ -1931,7 +1929,7 @@ def a_execute_agentic_test_cases_from_loop(
                         elapsed_time,
                         meta,
                     )
-                    if loop.get_debug() and settings.DEEPEVAL_DEBUG_ASYNC:
+                    if loop.get_debug() and get_settings().DEEPEVAL_DEBUG_ASYNC:
                         frames = t.get_stack(limit=6)
                         if frames:
                             logger.info("    stack:")
@@ -1970,9 +1968,9 @@ def a_execute_agentic_test_cases_from_loop(
                 if not leftovers:
                     return
-                if settings.DEEPEVAL_DEBUG_ASYNC:
+                if get_settings().DEEPEVAL_DEBUG_ASYNC:
                     logger.warning(
-                        "[deepeval] %d stray task(s) not tracked; cancelling…",
+                        "[deepeval] %d stray task(s) not tracked; cancelling...",
                         len(leftovers),
                     )
                     for t in leftovers:
@@ -1990,7 +1988,7 @@ def a_execute_agentic_test_cases_from_loop(
                     )
                 except RuntimeError:
                     # If the loop is closing here, just continue
-                    if settings.DEEPEVAL_DEBUG_ASYNC:
+                    if get_settings().DEEPEVAL_DEBUG_ASYNC:
                         logger.warning(
                             "[deepeval] failed to drain stray tasks because loop is closing"
                         )

deepeval/evaluate/types.py CHANGED Viewed

@@ -1,7 +1,8 @@
 from typing import Optional, List, Union, Dict
 from dataclasses import dataclass
 from pydantic import BaseModel
-from deepeval.test_run import MetricData
+from deepeval.test_run.api import MetricData, TurnApi
 from deepeval.test_case import MLLMImage
@@ -19,9 +20,11 @@ class TestResult:
     expected_output: Optional[str] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
+    turns: Optional[List[TurnApi]] = None
     additional_metadata: Optional[Dict] = None
 class EvaluationResult(BaseModel):
     test_results: List[TestResult]
     confident_link: Optional[str]
+    test_run_id: Optional[str]

deepeval 3.6.4__py3-none-any.whl → 3.6.6__py3-none-any.whl

deepeval 3.6.4__py3-none-any.whl → 3.6.6__py3-none-any.whl