deepeval 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. deepeval/__init__.py +42 -10
  2. deepeval/_version.py +1 -1
  3. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  4. deepeval/cli/main.py +42 -0
  5. deepeval/confident/api.py +1 -0
  6. deepeval/config/logging.py +33 -0
  7. deepeval/config/settings.py +176 -16
  8. deepeval/constants.py +8 -1
  9. deepeval/dataset/dataset.py +2 -11
  10. deepeval/dataset/utils.py +1 -1
  11. deepeval/evaluate/evaluate.py +5 -1
  12. deepeval/evaluate/execute.py +118 -60
  13. deepeval/evaluate/utils.py +20 -116
  14. deepeval/integrations/crewai/__init__.py +6 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/wrapper.py +45 -5
  18. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  19. deepeval/metrics/api.py +281 -0
  20. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  21. deepeval/metrics/bias/bias.py +12 -3
  22. deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
  23. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  24. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  25. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  26. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  27. deepeval/metrics/conversational_dag/nodes.py +12 -4
  28. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
  29. deepeval/metrics/dag/dag.py +12 -0
  30. deepeval/metrics/dag/nodes.py +12 -4
  31. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  32. deepeval/metrics/g_eval/g_eval.py +37 -15
  33. deepeval/metrics/hallucination/hallucination.py +12 -1
  34. deepeval/metrics/indicator.py +8 -2
  35. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  36. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  37. deepeval/metrics/mcp/mcp_task_completion.py +13 -0
  38. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
  39. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
  40. deepeval/metrics/misuse/misuse.py +12 -1
  41. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  42. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  43. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  44. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  45. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  46. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
  47. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  48. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  49. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  50. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  51. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  52. deepeval/metrics/non_advice/non_advice.py +12 -0
  53. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  54. deepeval/metrics/prompt_alignment/prompt_alignment.py +53 -24
  55. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  56. deepeval/metrics/role_violation/role_violation.py +12 -0
  57. deepeval/metrics/summarization/summarization.py +12 -1
  58. deepeval/metrics/task_completion/task_completion.py +3 -0
  59. deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
  60. deepeval/metrics/toxicity/toxicity.py +12 -0
  61. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  62. deepeval/models/llms/grok_model.py +1 -1
  63. deepeval/models/llms/openai_model.py +2 -0
  64. deepeval/models/retry_policy.py +202 -11
  65. deepeval/openai/__init__.py +14 -32
  66. deepeval/openai/extractors.py +24 -34
  67. deepeval/openai/patch.py +256 -161
  68. deepeval/openai/types.py +20 -0
  69. deepeval/openai/utils.py +98 -56
  70. deepeval/prompt/__init__.py +19 -1
  71. deepeval/prompt/api.py +160 -0
  72. deepeval/prompt/prompt.py +244 -62
  73. deepeval/prompt/utils.py +144 -2
  74. deepeval/synthesizer/chunking/context_generator.py +209 -152
  75. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  76. deepeval/synthesizer/synthesizer.py +8 -5
  77. deepeval/test_case/api.py +131 -0
  78. deepeval/test_run/__init__.py +1 -0
  79. deepeval/test_run/hyperparameters.py +47 -8
  80. deepeval/test_run/test_run.py +104 -1
  81. deepeval/tracing/api.py +3 -1
  82. deepeval/tracing/message_types/__init__.py +10 -0
  83. deepeval/tracing/message_types/base.py +6 -0
  84. deepeval/tracing/message_types/messages.py +14 -0
  85. deepeval/tracing/message_types/tools.py +18 -0
  86. deepeval/tracing/otel/exporter.py +0 -6
  87. deepeval/tracing/otel/utils.py +58 -8
  88. deepeval/tracing/trace_context.py +73 -4
  89. deepeval/tracing/trace_test_manager.py +19 -0
  90. deepeval/tracing/tracing.py +52 -4
  91. deepeval/tracing/types.py +16 -0
  92. deepeval/tracing/utils.py +8 -0
  93. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
  94. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/RECORD +97 -87
  95. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
  96. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
  97. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/__init__.py CHANGED
@@ -1,24 +1,56 @@
+ from __future__ import annotations
+
+ import logging
  import os
- import warnings
  import re
+ import warnings

- # load environment variables before other imports
+ # IMPORTANT: load environment variables before other imports
  from deepeval.config.settings import autoload_dotenv, get_settings

+ logging.getLogger("deepeval").addHandler(logging.NullHandler())
  autoload_dotenv()

- from ._version import __version__
- from deepeval.evaluate import evaluate, assert_test
- from deepeval.evaluate.compare import compare
- from deepeval.test_run import on_test_run_end, log_hyperparameters
- from deepeval.utils import login
- from deepeval.telemetry import *
+
+ def _expose_public_api() -> None:
+     # All other imports must happen after env is loaded
+     # Do not do this at module level or ruff will complain with E402
+     global __version__, evaluate, assert_test, compare
+     global on_test_run_end, log_hyperparameters, login, telemetry
+
+     from ._version import __version__ as _version
+     from deepeval.evaluate import (
+         evaluate as _evaluate,
+         assert_test as _assert_test,
+     )
+     from deepeval.evaluate.compare import compare as _compare
+     from deepeval.test_run import (
+         on_test_run_end as _on_end,
+         log_hyperparameters as _log_hparams,
+     )
+     from deepeval.utils import login as _login
+     import deepeval.telemetry as _telemetry
+
+     __version__ = _version
+     evaluate = _evaluate
+     assert_test = _assert_test
+     compare = _compare
+     on_test_run_end = _on_end
+     log_hyperparameters = _log_hparams
+     login = _login
+     telemetry = _telemetry
+
+
+ _expose_public_api()


  settings = get_settings()
+
  if not settings.DEEPEVAL_GRPC_LOGGING:
-     os.environ.setdefault("GRPC_VERBOSITY", "ERROR")
-     os.environ.setdefault("GRPC_TRACE", "")
+     if os.getenv("GRPC_VERBOSITY") is None:
+         os.environ["GRPC_VERBOSITY"] = settings.GRPC_VERBOSITY or "ERROR"
+     if os.getenv("GRPC_TRACE") is None:
+         os.environ["GRPC_TRACE"] = settings.GRPC_TRACE or ""


  __all__ = [
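
Note: `_expose_public_api()` runs at import time, so the package's top-level surface is unchanged for callers; the deferral only reorders when the submodules are imported relative to `autoload_dotenv()`. A minimal sanity check (plain Python, not part of the package):

    import deepeval

    # these names are assigned as module globals by _expose_public_api()
    print(deepeval.__version__)
    assert callable(deepeval.evaluate)
    assert callable(deepeval.assert_test)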
deepeval/_version.py CHANGED
@@ -1 +1 @@
- __version__: str = "3.6.5"
+ __version__: str = "3.6.7"
deepeval/benchmarks/equity_med_qa/equity_med_qa.py CHANGED
@@ -121,6 +121,7 @@ class EquityMedQA(DeepEvalBaseBenchmark):
          score = metric.measure(
              LLMTestCase(input=golden.input, actual_output=prediction),
              _show_indicator=False,
+             _log_metric_to_confident=False,
          )
          flipped_score = (
              1 - metric.score if metric.score in [0, 1] else metric.score
deepeval/cli/main.py CHANGED
@@ -328,6 +328,31 @@ def set_debug(
          "--trace-flush/--no-trace-flush",
          help="Enable / disable CONFIDENT_TRACE_FLUSH.",
      ),
+     trace_sample_rate: Optional[float] = typer.Option(
+         None,
+         "--trace-sample-rate",
+         help="Set CONFIDENT_TRACE_SAMPLE_RATE.",
+     ),
+     metric_logging_verbose: Optional[bool] = typer.Option(
+         None,
+         "--metric-logging-verbose/--no-metric-logging-verbose",
+         help="Enable / disable CONFIDENT_METRIC_LOGGING_VERBOSE.",
+     ),
+     metric_logging_flush: Optional[bool] = typer.Option(
+         None,
+         "--metric-logging-flush/--no-metric-logging-flush",
+         help="Enable / disable CONFIDENT_METRIC_LOGGING_FLUSH.",
+     ),
+     metric_logging_sample_rate: Optional[float] = typer.Option(
+         None,
+         "--metric-logging-sample-rate",
+         help="Set CONFIDENT_METRIC_LOGGING_SAMPLE_RATE.",
+     ),
+     metric_logging_enabled: Optional[bool] = typer.Option(
+         None,
+         "--metric-logging-enabled/--no-metric-logging-enabled",
+         help="Enable / disable CONFIDENT_METRIC_LOGGING_ENABLED.",
+     ),
      # Advanced / potentially surprising
      error_reporting: Optional[bool] = typer.Option(
          None,
@@ -387,6 +412,20 @@ def set_debug(
          settings.CONFIDENT_TRACE_ENVIRONMENT = trace_env
      if trace_flush is not None:
          settings.CONFIDENT_TRACE_FLUSH = trace_flush
+     if trace_sample_rate is not None:
+         settings.CONFIDENT_TRACE_SAMPLE_RATE = trace_sample_rate
+
+     # Confident metrics
+     if metric_logging_verbose is not None:
+         settings.CONFIDENT_METRIC_LOGGING_VERBOSE = metric_logging_verbose
+     if metric_logging_flush is not None:
+         settings.CONFIDENT_METRIC_LOGGING_FLUSH = metric_logging_flush
+     if metric_logging_sample_rate is not None:
+         settings.CONFIDENT_METRIC_LOGGING_SAMPLE_RATE = (
+             metric_logging_sample_rate
+         )
+     if metric_logging_enabled is not None:
+         settings.CONFIDENT_METRIC_LOGGING_ENABLED = metric_logging_enabled

      # Advanced
      if error_reporting is not None:
@@ -438,6 +477,8 @@ def unset_debug(
      settings.LOG_LEVEL = "info"
      settings.CONFIDENT_TRACE_ENVIRONMENT = "development"
      settings.CONFIDENT_TRACE_VERBOSE = True
+     settings.CONFIDENT_METRIC_LOGGING_VERBOSE = True
+     settings.CONFIDENT_METRIC_LOGGING_ENABLED = True

      # Clear optional toggles/overrides
      settings.DEEPEVAL_VERBOSE_MODE = None
@@ -449,6 +490,7 @@ def unset_debug(
      settings.GRPC_TRACE = None

      settings.CONFIDENT_TRACE_FLUSH = None
+     settings.CONFIDENT_METRIC_LOGGING_FLUSH = None

      settings.ERROR_REPORTING = None
      settings.IGNORE_DEEPEVAL_ERRORS = None
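
Note: each new flag writes straight to a `Settings` field of the same name, so the equivalent can be done programmatically. A rough sketch mirroring what `set_debug` does with these options (whether the change persists depends on how the settings object is saved in your environment):

    from deepeval.config.settings import get_settings

    settings = get_settings()
    # same fields the new CLI options assign in set_debug()
    settings.CONFIDENT_TRACE_SAMPLE_RATE = 0.25
    settings.CONFIDENT_METRIC_LOGGING_SAMPLE_RATE = 0.5
    settings.CONFIDENT_METRIC_LOGGING_ENABLED = False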
deepeval/confident/api.py CHANGED
@@ -87,6 +87,7 @@ class Endpoints(Enum):
      DATASET_ALIAS_QUEUE_ENDPOINT = "/v1/datasets/:alias/queue"

      TEST_RUN_ENDPOINT = "/v1/test-run"
+     METRIC_DATA_ENDPOINT = "/v1/metric-data"
      TRACES_ENDPOINT = "/v1/traces"
      ANNOTATIONS_ENDPOINT = "/v1/annotations"
      PROMPTS_VERSION_ID_ENDPOINT = "/v1/prompts/:alias/versions/:versionId"
deepeval/config/logging.py ADDED
@@ -0,0 +1,33 @@
+ """
+ Minimal logging configuration helpers for DeepEval.
+
+ This module centralizes how the library-level logger ("deepeval") is configured. We
+ intentionally keep configuration lightweight so application code retains control
+ over handlers and formatters.
+ """
+
+ import logging
+ from deepeval.config.settings import get_settings
+
+
+ def apply_deepeval_log_level() -> None:
+     """
+     Apply DeepEval's current log level to the package logger.
+
+     This function reads `LOG_LEVEL` from `deepeval.config.settings.get_settings()`
+     and sets the level of the `"deepeval"` logger accordingly. If `LOG_LEVEL` is
+     unset (None), INFO is used as a default. The logger's `propagate` flag is set
+     to True so records bubble up to the application's handlers. DeepEval does not
+     install its own handlers here (a NullHandler is attached in `__init__.py`).
+
+     The function is idempotent and safe to call multiple times. It is invoked
+     automatically when settings are first constructed and whenever `LOG_LEVEL`
+     is changed via `settings.edit`.
+     """
+     settings = get_settings()
+     log_level = settings.LOG_LEVEL
+     logging.getLogger("deepeval").setLevel(
+         log_level if log_level is not None else logging.INFO
+     )
+     # ensure we bubble up to app handlers
+     logging.getLogger("deepeval").propagate = True
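
Note: since the library only attaches a `NullHandler` to the `"deepeval"` logger and `apply_deepeval_log_level()` keeps `propagate=True`, an application that wants to see DeepEval's log records simply configures its own handler. A minimal sketch using only the standard library:

    import logging

    from deepeval.config.logging import apply_deepeval_log_level

    logging.basicConfig(level=logging.DEBUG)  # application-owned handler/formatter
    apply_deepeval_log_level()                # re-applies settings.LOG_LEVEL to the "deepeval" logger
    logging.getLogger("deepeval").info("now visible through the root handler")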
deepeval/config/settings.py CHANGED
@@ -10,12 +10,20 @@ Central config for DeepEval.
  """

  import logging
+ import math
  import os
  import re

  from dotenv import dotenv_values
  from pathlib import Path
- from pydantic import AnyUrl, SecretStr, field_validator, confloat
+ from pydantic import (
+     AnyUrl,
+     computed_field,
+     confloat,
+     conint,
+     field_validator,
+     SecretStr,
+ )
  from pydantic_settings import BaseSettings, SettingsConfigDict
  from typing import Any, Dict, List, Optional, NamedTuple

@@ -155,7 +163,7 @@ class Settings(BaseSettings):
      #
      APP_ENV: str = "dev"

-     LOG_LEVEL: str = "info"
+     LOG_LEVEL: Optional[int] = None
      PYTHONPATH: str = "."
      CONFIDENT_REGION: Optional[str] = None
      CONFIDENT_OPEN_BROWSER: Optional[bool] = True
@@ -287,9 +295,33 @@
      #
      # Retry Policy
      #
-     DEEPEVAL_SDK_RETRY_PROVIDERS: Optional[List[str]] = None
-     DEEPEVAL_RETRY_BEFORE_LOG_LEVEL: Optional[int] = None  # default -> INFO
+     # Controls how Tenacity retries provider calls when the SDK isn't doing its own retries.
+     # Key concepts:
+     # - attempts count includes the first call. e.g. 1 = no retries, 2 = one retry.
+     # - backoff sleeps follow exponential growth with a cap, plus jitter. Expected jitter
+     #   contribution is ~ JITTER/2 per sleep.
+     # - logging levels are looked up dynamically each attempt, so if you change LOG_LEVEL at runtime,
+     #   the retry loggers will honor it without restart.
+     DEEPEVAL_SDK_RETRY_PROVIDERS: Optional[List[str]] = (
+         None  # ["*"] to delegate all retries to SDKs
+     )
+     DEEPEVAL_RETRY_BEFORE_LOG_LEVEL: Optional[int] = (
+         None  # default is LOG_LEVEL if set, else INFO
+     )
      DEEPEVAL_RETRY_AFTER_LOG_LEVEL: Optional[int] = None  # default -> ERROR
+     DEEPEVAL_RETRY_MAX_ATTEMPTS: conint(ge=1) = (
+         2  # attempts = first try + retries
+     )
+     DEEPEVAL_RETRY_INITIAL_SECONDS: confloat(ge=0) = (
+         1.0  # first sleep before retry, if any
+     )
+     DEEPEVAL_RETRY_EXP_BASE: confloat(ge=1) = (
+         2.0  # exponential growth factor for sleeps
+     )
+     DEEPEVAL_RETRY_JITTER: confloat(ge=0) = 2.0  # uniform jitter
+     DEEPEVAL_RETRY_CAP_SECONDS: confloat(ge=0) = (
+         5.0  # cap for each backoff sleep
+     )

      #
      # Telemetry and Debug
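
Note: a small worked sketch of the sleep schedule those comments describe, using the defaults above (min(cap, initial * base**n) per sleep, plus an expected JITTER/2 per sleep); this is illustrative arithmetic, not code from the package:

    attempts, initial, base, cap, jitter = 2, 1.0, 2.0, 5.0, 2.0
    sleeps = [min(cap, initial * base**n) for n in range(attempts - 1)]
    expected_wait = sum(sleeps) + len(sleeps) * jitter / 2
    print(sleeps, expected_wait)  # [1.0] and 2.0 seconds of expected backoff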
@@ -305,10 +337,17 @@
      SKIP_DEEPEVAL_MISSING_PARAMS: Optional[bool] = None
      DEEPEVAL_VERBOSE_MODE: Optional[bool] = None
      ENABLE_DEEPEVAL_CACHE: Optional[bool] = None
+
      CONFIDENT_TRACE_FLUSH: Optional[bool] = None
      CONFIDENT_TRACE_ENVIRONMENT: Optional[str] = "development"
      CONFIDENT_TRACE_VERBOSE: Optional[bool] = True
-     CONFIDENT_SAMPLE_RATE: Optional[float] = 1.0
+     CONFIDENT_TRACE_SAMPLE_RATE: Optional[float] = 1.0
+
+     CONFIDENT_METRIC_LOGGING_FLUSH: Optional[bool] = None
+     CONFIDENT_METRIC_LOGGING_VERBOSE: Optional[bool] = True
+     CONFIDENT_METRIC_LOGGING_SAMPLE_RATE: Optional[float] = 1.0
+     CONFIDENT_METRIC_LOGGING_ENABLED: Optional[bool] = True
+
      OTEL_EXPORTER_OTLP_ENDPOINT: Optional[AnyUrl] = None

      #
@@ -316,19 +355,93 @@
      #
      MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS: float = 3.05
      MEDIA_IMAGE_READ_TIMEOUT_SECONDS: float = 10.0
+     # DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS: per-attempt timeout for provider calls enforced by our retry decorator.
+     # This timeout interacts with retry policy and the task level budget (DEEPEVAL_PER_TASK_TIMEOUT_SECONDS) below.
+     # If you leave this at 0/None, the computed outer budget defaults to 180s.
+     DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS: Optional[confloat(ge=0)] = (
+         None  # per-attempt timeout. Set 0/None to disable
+     )

      #
-     # Async Task Configuration
+     # Async Document Pipelines
      #

-     # Maximum time allowed for a single task to complete
-     DEEPEVAL_PER_TASK_TIMEOUT_SECONDS: int = (
-         300  # Set to float('inf') to disable timeout
-     )
+     DEEPEVAL_MAX_CONCURRENT_DOC_PROCESSING: conint(ge=1) = 2
+
+     #
+     # Async Task Configuration
+     #
+     DEEPEVAL_TIMEOUT_THREAD_LIMIT: conint(ge=1) = 128
+     DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS: confloat(ge=0) = 5.0
+     # DEEPEVAL_PER_TASK_TIMEOUT_SECONDS is the outer time budget for one metric/task.
+     # It is computed from per-attempt timeout + retries/backoff unless you explicitly override it.
+     # - OVERRIDE = None or 0 -> auto compute as:
+     #   attempts * per_attempt_timeout + sum(backoff_sleeps) + ~jitter/2 per sleep + 1s safety
+     #   (If per_attempt_timeout is 0/None, the auto outer budget defaults to 180s.)
+     # - OVERRIDE > 0 -> use that exact value. A warning is logged if it is likely too small
+     #   to permit the configured attempts/backoff.
+     #
+     # Tip:
+     #   Most users only need to set DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS and DEEPEVAL_RETRY_MAX_ATTEMPTS.
+     #   Leave the outer budget on auto unless you have very strict SLAs.
+     DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE: Optional[conint(ge=0)] = None

      # Buffer time for gathering results from all tasks, added to the longest task duration
      # Increase if many tasks are running concurrently
-     DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: int = 60
+     DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: confloat(ge=0) = 60
+
+     ###################
+     # Computed Fields #
+     ###################
+
+     def _calc_auto_outer_timeout(self) -> int:
+         """Compute outer budget from per-attempt timeout + retries/backoff.
+         Never reference the computed property itself here.
+         """
+         attempts = self.DEEPEVAL_RETRY_MAX_ATTEMPTS or 1
+         timeout_seconds = float(self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
+         if timeout_seconds <= 0:
+             # No per-attempt timeout set -> default outer budget
+             return 180
+
+         sleeps = max(0, attempts - 1)
+         cur = float(self.DEEPEVAL_RETRY_INITIAL_SECONDS)
+         cap = float(self.DEEPEVAL_RETRY_CAP_SECONDS)
+         base = float(self.DEEPEVAL_RETRY_EXP_BASE)
+         jitter = float(self.DEEPEVAL_RETRY_JITTER)
+
+         backoff = 0.0
+         for _ in range(sleeps):
+             backoff += min(cap, cur)
+             cur *= base
+         backoff += sleeps * (jitter / 2.0)  # expected jitter
+
+         safety_overhead = 1.0
+         return int(
+             math.ceil(attempts * timeout_seconds + backoff + safety_overhead)
+         )
+
+     @computed_field
+     @property
+     def DEEPEVAL_PER_TASK_TIMEOUT_SECONDS(self) -> int:
+         """If OVERRIDE is set (nonzero), return it; else return the derived budget."""
+         outer = self.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE
+         if outer not in (None, 0):
+             # Warn if user-provided outer is likely to truncate retries
+             if (self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0) > 0:
+                 min_needed = self._calc_auto_outer_timeout()
+                 if int(outer) < min_needed:
+                     if self.DEEPEVAL_VERBOSE_MODE:
+                         logger.warning(
+                             "Metric timeout (outer=%ss) is less than attempts Ɨ per-attempt "
+                             "timeout + backoff (ā‰ˆ%ss). Retries may be cut short.",
+                             int(outer),
+                             min_needed,
+                         )
+             return int(outer)
+
+         # Auto mode
+         return self._calc_auto_outer_timeout()

      ##############
      # Validators #
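
Note: a worked example of the auto-computed budget, assuming a per-attempt timeout of 30s together with the retry defaults above: attempts = 2 means one sleep; that sleep is min(5.0, 1.0) = 1.0s, expected jitter adds 1 Ɨ (2.0 / 2) = 1.0s, and the safety overhead is 1.0s, so DEEPEVAL_PER_TASK_TIMEOUT_SECONDS = ceil(2 Ɨ 30 + 2.0 + 1.0) = 63 seconds. With no per-attempt timeout configured, the property simply falls back to 180.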
@@ -384,7 +497,8 @@ class Settings(BaseSettings):
          "OPENAI_COST_PER_INPUT_TOKEN",
          "OPENAI_COST_PER_OUTPUT_TOKEN",
          "TEMPERATURE",
-         "CONFIDENT_SAMPLE_RATE",
+         "CONFIDENT_TRACE_SAMPLE_RATE",
+         "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE",
          mode="before",
      )
      @classmethod
@@ -396,13 +510,17 @@
              return None
          return float(v)

-     @field_validator("CONFIDENT_SAMPLE_RATE")
+     @field_validator(
+         "CONFIDENT_TRACE_SAMPLE_RATE", "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE"
+     )
      @classmethod
      def _validate_sample_rate(cls, v):
          if v is None:
              return None
          if not (0.0 <= float(v) <= 1.0):
-             raise ValueError("CONFIDENT_SAMPLE_RATE must be between 0 and 1")
+             raise ValueError(
+                 "CONFIDENT_TRACE_SAMPLE_RATE or CONFIDENT_METRIC_LOGGING_SAMPLE_RATE must be between 0 and 1"
+             )
          return float(v)

      @field_validator("DEEPEVAL_DEFAULT_SAVE", mode="before")
@@ -474,7 +592,9 @@
              if s in SUPPORTED_PROVIDER_SLUGS:
                  normalized.append(s)
              else:
-                 if cls.DEEPEVAL_VERBOSE_MODE:
+                 if parse_bool(
+                     os.getenv("DEEPEVAL_VERBOSE_MODE"), default=False
+                 ):
                      logger.warning("Unknown provider slug %r dropped", item)

          if star:
@@ -487,6 +607,7 @@
      @field_validator(
          "DEEPEVAL_RETRY_BEFORE_LOG_LEVEL",
          "DEEPEVAL_RETRY_AFTER_LOG_LEVEL",
+         "LOG_LEVEL",
          mode="before",
      )
      @classmethod
@@ -524,6 +645,10 @@
      # Persistence support #
      #######################
      class _SettingsEditCtx:
+         COMPUTED_FIELDS: frozenset[str] = frozenset(
+             {"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS"}
+         )
+
          def __init__(
              self,
              settings: "Settings",
@@ -559,8 +684,11 @@
              # lazy import legacy JSON store deps
              from deepeval.key_handler import KEY_FILE_HANDLER

+             model_fields = type(self._s).model_fields
+             # Exclude computed fields from persistence
+
              # compute diff of changed fields
-             after = {k: getattr(self._s, k) for k in type(self._s).model_fields}
+             after = {k: getattr(self._s, k) for k in model_fields}

              before_norm = {
                  k: _normalize_for_env(v) for k, v in self._before.items()
@@ -570,12 +698,21 @@
              changed_keys = {
                  k for k in after_norm if after_norm[k] != before_norm.get(k)
              }
+             changed_keys -= self.COMPUTED_FIELDS
+
              if not changed_keys:
                  self.result = PersistResult(False, None, {})
                  return False

              updates = {k: after[k] for k in changed_keys}

+             if "LOG_LEVEL" in updates:
+                 from deepeval.config.logging import (
+                     apply_deepeval_log_level,
+                 )
+
+                 apply_deepeval_log_level()
+
              #
              # .deepeval JSON support
              #
@@ -681,4 +818,27 @@
      global _settings_singleton
      if _settings_singleton is None:
          _settings_singleton = Settings()
+         from deepeval.config.logging import apply_deepeval_log_level
+
+         apply_deepeval_log_level()
      return _settings_singleton
+
+
+ def reset_settings(*, reload_dotenv: bool = False) -> Settings:
+     """
+     Drop the cached Settings singleton and rebuild it from the current process
+     environment.
+
+     Args:
+         reload_dotenv: When True, call `autoload_dotenv()` before re-instantiating,
+                        which merges .env values into os.environ (never overwriting
+                        existing process env vars).
+
+     Returns:
+         The fresh Settings instance.
+     """
+     global _settings_singleton
+     if reload_dotenv:
+         autoload_dotenv()
+     _settings_singleton = None
+     return get_settings()
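
Note: `reset_settings` gives tests and long-lived processes a way to pick up environment changes without restarting. A minimal usage sketch based on the signature above (the environment variable and value here are arbitrary examples):

    import os

    from deepeval.config.settings import get_settings, reset_settings

    os.environ["CONFIDENT_METRIC_LOGGING_ENABLED"] = "0"
    settings = reset_settings(reload_dotenv=True)  # rebuilds the singleton from os.environ
    assert settings is get_settings()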
deepeval/constants.py CHANGED
@@ -9,9 +9,16 @@ LOGIN_PROMPT = "\nāœØšŸ‘€ Looking for a place for your LLM test data to live

  CONFIDENT_TRACE_VERBOSE = "CONFIDENT_TRACE_VERBOSE"
  CONFIDENT_TRACE_FLUSH = "CONFIDENT_TRACE_FLUSH"
- CONFIDENT_SAMPLE_RATE = "CONFIDENT_SAMPLE_RATE"
+ CONFIDENT_TRACE_SAMPLE_RATE = "CONFIDENT_TRACE_SAMPLE_RATE"
  CONFIDENT_TRACE_ENVIRONMENT = "CONFIDENT_TRACE_ENVIRONMENT"
  CONFIDENT_TRACING_ENABLED = "CONFIDENT_TRACING_ENABLED"
+
+ CONFIDENT_METRIC_LOGGING_VERBOSE = "CONFIDENT_METRIC_LOGGING_VERBOSE"
+ CONFIDENT_METRIC_LOGGING_FLUSH = "CONFIDENT_METRIC_LOGGING_FLUSH"
+ CONFIDENT_METRIC_LOGGING_SAMPLE_RATE = "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE"
+ CONFIDENT_METRIC_LOGGING_ENABLED = "CONFIDENT_METRIC_LOGGING_ENABLED"
+
+
  CONFIDENT_OPEN_BROWSER = "CONFIDENT_OPEN_BROWSER"
  CONFIDENT_TEST_CASE_BATCH_SIZE = "CONFIDENT_TEST_CASE_BATCH_SIZE"

deepeval/dataset/dataset.py CHANGED
@@ -49,7 +49,7 @@ from deepeval.utils import (
  from deepeval.test_run import (
      global_test_run_manager,
  )
- from deepeval.openai.utils import openai_test_case_pairs
+
  from deepeval.tracing import trace_manager
  from deepeval.tracing.tracing import EVAL_DUMMY_SPAN_NAME

@@ -1248,16 +1248,7 @@ class EvaluationDataset:
              display_config.file_output_dir,
          )

-         # update hyperparameters
-         test_run = global_test_run_manager.get_test_run()
-         if len(openai_test_case_pairs) > 0:
-             raw_hyperparameters = openai_test_case_pairs[-1].hyperparameters
-             test_run.hyperparameters = process_hyperparameters(
-                 raw_hyperparameters
-             )
-
-         # clean up
-         openai_test_case_pairs.clear()
+         # save test run
          global_test_run_manager.save_test_run(TEMP_FILE_PATH)

          # sandwich end trace for OTEL
deepeval/dataset/utils.py CHANGED
@@ -120,7 +120,7 @@ def format_turns(turns: List[Turn]) -> str:
          }
          res.append(cur_turn)
      try:
-         return json.dumps(res)
+         return json.dumps(res, ensure_ascii=False)
      except Exception as e:
          raise ValueError(f"Error serializing turns: {e}")

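
Note: `ensure_ascii=False` only changes how non-ASCII turn content is written out; this is standard-library `json` behavior, shown here for clarity:

    import json

    turns = [{"role": "user", "content": "ĀæcĆ³mo estĆ”s?"}]
    print(json.dumps(turns))                      # escapes to \u00bfc\u00f3mo est\u00e1s?
    print(json.dumps(turns, ensure_ascii=False))  # keeps ĀæcĆ³mo estĆ”s? as-is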
deepeval/evaluate/evaluate.py CHANGED
@@ -28,7 +28,10 @@ from deepeval.evaluate.utils import (
  from deepeval.dataset import Golden
  from deepeval.prompt import Prompt
  from deepeval.test_case.utils import check_valid_test_cases_type
- from deepeval.test_run.hyperparameters import process_hyperparameters
+ from deepeval.test_run.hyperparameters import (
+     process_hyperparameters,
+     process_prompts,
+ )
  from deepeval.test_run.test_run import TEMP_FILE_PATH
  from deepeval.utils import (
      get_or_create_event_loop,
@@ -267,6 +270,7 @@

      test_run = global_test_run_manager.get_test_run()
      test_run.hyperparameters = process_hyperparameters(hyperparameters)
+     test_run.prompts = process_prompts(hyperparameters)
      global_test_run_manager.save_test_run(TEMP_FILE_PATH)
      res = global_test_run_manager.wrap_up_test_run(
          run_duration, display_table=False