deepeval 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +42 -10
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/logging.py +33 -0
- deepeval/config/settings.py +176 -16
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +118 -60
- deepeval/evaluate/utils.py +20 -116
- deepeval/integrations/crewai/__init__.py +6 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +37 -15
- deepeval/metrics/hallucination/hallucination.py +12 -1
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +13 -0
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/prompt_alignment/prompt_alignment.py +53 -24
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +3 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/models/retry_policy.py +202 -11
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +24 -34
- deepeval/openai/patch.py +256 -161
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +98 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +244 -62
- deepeval/prompt/utils.py +144 -2
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +8 -5
- deepeval/test_case/api.py +131 -0
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +104 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/message_types/__init__.py +10 -0
- deepeval/tracing/message_types/base.py +6 -0
- deepeval/tracing/message_types/messages.py +14 -0
- deepeval/tracing/message_types/tools.py +18 -0
- deepeval/tracing/otel/exporter.py +0 -6
- deepeval/tracing/otel/utils.py +58 -8
- deepeval/tracing/trace_context.py +73 -4
- deepeval/tracing/trace_test_manager.py +19 -0
- deepeval/tracing/tracing.py +52 -4
- deepeval/tracing/types.py +16 -0
- deepeval/tracing/utils.py +8 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/RECORD +97 -87
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/__init__.py
CHANGED
|
@@ -1,24 +1,56 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
1
4
|
import os
|
|
2
|
-
import warnings
|
|
3
5
|
import re
|
|
6
|
+
import warnings
|
|
4
7
|
|
|
5
|
-
# load environment variables before other imports
|
|
8
|
+
# IMPORTANT: load environment variables before other imports
|
|
6
9
|
from deepeval.config.settings import autoload_dotenv, get_settings
|
|
7
10
|
|
|
11
|
+
logging.getLogger("deepeval").addHandler(logging.NullHandler())
|
|
8
12
|
autoload_dotenv()
|
|
9
13
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
14
|
+
|
|
15
|
+
def _expose_public_api() -> None:
|
|
16
|
+
# All other imports must happen after env is loaded
|
|
17
|
+
# Do not do this at module level or ruff will complain with E402
|
|
18
|
+
global __version__, evaluate, assert_test, compare
|
|
19
|
+
global on_test_run_end, log_hyperparameters, login, telemetry
|
|
20
|
+
|
|
21
|
+
from ._version import __version__ as _version
|
|
22
|
+
from deepeval.evaluate import (
|
|
23
|
+
evaluate as _evaluate,
|
|
24
|
+
assert_test as _assert_test,
|
|
25
|
+
)
|
|
26
|
+
from deepeval.evaluate.compare import compare as _compare
|
|
27
|
+
from deepeval.test_run import (
|
|
28
|
+
on_test_run_end as _on_end,
|
|
29
|
+
log_hyperparameters as _log_hparams,
|
|
30
|
+
)
|
|
31
|
+
from deepeval.utils import login as _login
|
|
32
|
+
import deepeval.telemetry as _telemetry
|
|
33
|
+
|
|
34
|
+
__version__ = _version
|
|
35
|
+
evaluate = _evaluate
|
|
36
|
+
assert_test = _assert_test
|
|
37
|
+
compare = _compare
|
|
38
|
+
on_test_run_end = _on_end
|
|
39
|
+
log_hyperparameters = _log_hparams
|
|
40
|
+
login = _login
|
|
41
|
+
telemetry = _telemetry
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
_expose_public_api()
|
|
16
45
|
|
|
17
46
|
|
|
18
47
|
settings = get_settings()
|
|
48
|
+
|
|
19
49
|
if not settings.DEEPEVAL_GRPC_LOGGING:
|
|
20
|
-
os.
|
|
21
|
-
|
|
50
|
+
if os.getenv("GRPC_VERBOSITY") is None:
|
|
51
|
+
os.environ["GRPC_VERBOSITY"] = settings.GRPC_VERBOSITY or "ERROR"
|
|
52
|
+
if os.getenv("GRPC_TRACE") is None:
|
|
53
|
+
os.environ["GRPC_TRACE"] = settings.GRPC_TRACE or ""
|
|
22
54
|
|
|
23
55
|
|
|
24
56
|
__all__ = [
|
deepeval/_version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__: str = "3.6.5"
|
|
1
|
+
__version__: str = "3.6.7"
|
|
@@ -121,6 +121,7 @@ class EquityMedQA(DeepEvalBaseBenchmark):
|
|
|
121
121
|
score = metric.measure(
|
|
122
122
|
LLMTestCase(input=golden.input, actual_output=prediction),
|
|
123
123
|
_show_indicator=False,
|
|
124
|
+
_log_metric_to_confident=False,
|
|
124
125
|
)
|
|
125
126
|
flipped_score = (
|
|
126
127
|
1 - metric.score if metric.score in [0, 1] else metric.score
|
deepeval/cli/main.py
CHANGED
|
@@ -328,6 +328,31 @@ def set_debug(
|
|
|
328
328
|
"--trace-flush/--no-trace-flush",
|
|
329
329
|
help="Enable / disable CONFIDENT_TRACE_FLUSH.",
|
|
330
330
|
),
|
|
331
|
+
trace_sample_rate: Optional[float] = typer.Option(
|
|
332
|
+
None,
|
|
333
|
+
"--trace-sample-rate",
|
|
334
|
+
help="Set CONFIDENT_TRACE_SAMPLE_RATE.",
|
|
335
|
+
),
|
|
336
|
+
metric_logging_verbose: Optional[bool] = typer.Option(
|
|
337
|
+
None,
|
|
338
|
+
"--metric-logging-verbose/--no-metric-logging-verbose",
|
|
339
|
+
help="Enable / disable CONFIDENT_METRIC_LOGGING_VERBOSE.",
|
|
340
|
+
),
|
|
341
|
+
metric_logging_flush: Optional[bool] = typer.Option(
|
|
342
|
+
None,
|
|
343
|
+
"--metric-logging-flush/--no-metric-logging-flush",
|
|
344
|
+
help="Enable / disable CONFIDENT_METRIC_LOGGING_FLUSH.",
|
|
345
|
+
),
|
|
346
|
+
metric_logging_sample_rate: Optional[float] = typer.Option(
|
|
347
|
+
None,
|
|
348
|
+
"--metric-logging-sample-rate",
|
|
349
|
+
help="Set CONFIDENT_METRIC_LOGGING_SAMPLE_RATE.",
|
|
350
|
+
),
|
|
351
|
+
metric_logging_enabled: Optional[bool] = typer.Option(
|
|
352
|
+
None,
|
|
353
|
+
"--metric-logging-enabled/--no-metric-logging-enabled",
|
|
354
|
+
help="Enable / disable CONFIDENT_METRIC_LOGGING_ENABLED.",
|
|
355
|
+
),
|
|
331
356
|
# Advanced / potentially surprising
|
|
332
357
|
error_reporting: Optional[bool] = typer.Option(
|
|
333
358
|
None,
|
|
@@ -387,6 +412,20 @@ def set_debug(
|
|
|
387
412
|
settings.CONFIDENT_TRACE_ENVIRONMENT = trace_env
|
|
388
413
|
if trace_flush is not None:
|
|
389
414
|
settings.CONFIDENT_TRACE_FLUSH = trace_flush
|
|
415
|
+
if trace_sample_rate is not None:
|
|
416
|
+
settings.CONFIDENT_TRACE_SAMPLE_RATE = trace_sample_rate
|
|
417
|
+
|
|
418
|
+
# Confident metrics
|
|
419
|
+
if metric_logging_verbose is not None:
|
|
420
|
+
settings.CONFIDENT_METRIC_LOGGING_VERBOSE = metric_logging_verbose
|
|
421
|
+
if metric_logging_flush is not None:
|
|
422
|
+
settings.CONFIDENT_METRIC_LOGGING_FLUSH = metric_logging_flush
|
|
423
|
+
if metric_logging_sample_rate is not None:
|
|
424
|
+
settings.CONFIDENT_METRIC_LOGGING_SAMPLE_RATE = (
|
|
425
|
+
metric_logging_sample_rate
|
|
426
|
+
)
|
|
427
|
+
if metric_logging_enabled is not None:
|
|
428
|
+
settings.CONFIDENT_METRIC_LOGGING_ENABLED = metric_logging_enabled
|
|
390
429
|
|
|
391
430
|
# Advanced
|
|
392
431
|
if error_reporting is not None:
|
|
@@ -438,6 +477,8 @@ def unset_debug(
|
|
|
438
477
|
settings.LOG_LEVEL = "info"
|
|
439
478
|
settings.CONFIDENT_TRACE_ENVIRONMENT = "development"
|
|
440
479
|
settings.CONFIDENT_TRACE_VERBOSE = True
|
|
480
|
+
settings.CONFIDENT_METRIC_LOGGING_VERBOSE = True
|
|
481
|
+
settings.CONFIDENT_METRIC_LOGGING_ENABLED = True
|
|
441
482
|
|
|
442
483
|
# Clear optional toggles/overrides
|
|
443
484
|
settings.DEEPEVAL_VERBOSE_MODE = None
|
|
@@ -449,6 +490,7 @@ def unset_debug(
|
|
|
449
490
|
settings.GRPC_TRACE = None
|
|
450
491
|
|
|
451
492
|
settings.CONFIDENT_TRACE_FLUSH = None
|
|
493
|
+
settings.CONFIDENT_METRIC_LOGGING_FLUSH = None
|
|
452
494
|
|
|
453
495
|
settings.ERROR_REPORTING = None
|
|
454
496
|
settings.IGNORE_DEEPEVAL_ERRORS = None
|
deepeval/confident/api.py
CHANGED
|
@@ -87,6 +87,7 @@ class Endpoints(Enum):
|
|
|
87
87
|
DATASET_ALIAS_QUEUE_ENDPOINT = "/v1/datasets/:alias/queue"
|
|
88
88
|
|
|
89
89
|
TEST_RUN_ENDPOINT = "/v1/test-run"
|
|
90
|
+
METRIC_DATA_ENDPOINT = "/v1/metric-data"
|
|
90
91
|
TRACES_ENDPOINT = "/v1/traces"
|
|
91
92
|
ANNOTATIONS_ENDPOINT = "/v1/annotations"
|
|
92
93
|
PROMPTS_VERSION_ID_ENDPOINT = "/v1/prompts/:alias/versions/:versionId"
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Minimal logging configuration helpers for DeepEval.
|
|
3
|
+
|
|
4
|
+
This module centralizes how the library-level logger ("deepeval") is configured. We
|
|
5
|
+
intentionally keep configuration lightweight so application code retains control
|
|
6
|
+
over handlers and formatters.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
from deepeval.config.settings import get_settings
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def apply_deepeval_log_level() -> None:
|
|
14
|
+
"""
|
|
15
|
+
Apply DeepEval's current log level to the package logger.
|
|
16
|
+
|
|
17
|
+
This function reads `LOG_LEVEL` from `deepeval.config.settings.get_settings()`
|
|
18
|
+
and sets the level of the `"deepeval"` logger accordingly. If `LOG_LEVEL` is
|
|
19
|
+
unset (None), INFO is used as a default. The logger's `propagate` flag is set
|
|
20
|
+
to True so records bubble up to the application's handlers. DeepEval does not
|
|
21
|
+
install its own handlers here (a NullHandler is attached in `__init__.py`).
|
|
22
|
+
|
|
23
|
+
The function is idempotent and safe to call multiple times. It is invoked
|
|
24
|
+
automatically when settings are first constructed and whenever `LOG_LEVEL`
|
|
25
|
+
is changed via `settings.edit`.
|
|
26
|
+
"""
|
|
27
|
+
settings = get_settings()
|
|
28
|
+
log_level = settings.LOG_LEVEL
|
|
29
|
+
logging.getLogger("deepeval").setLevel(
|
|
30
|
+
log_level if log_level is not None else logging.INFO
|
|
31
|
+
)
|
|
32
|
+
# ensure we bubble up to app handlers
|
|
33
|
+
logging.getLogger("deepeval").propagate = True
|
deepeval/config/settings.py
CHANGED
|
@@ -10,12 +10,20 @@ Central config for DeepEval.
|
|
|
10
10
|
"""
|
|
11
11
|
|
|
12
12
|
import logging
|
|
13
|
+
import math
|
|
13
14
|
import os
|
|
14
15
|
import re
|
|
15
16
|
|
|
16
17
|
from dotenv import dotenv_values
|
|
17
18
|
from pathlib import Path
|
|
18
|
-
from pydantic import
|
|
19
|
+
from pydantic import (
|
|
20
|
+
AnyUrl,
|
|
21
|
+
computed_field,
|
|
22
|
+
confloat,
|
|
23
|
+
conint,
|
|
24
|
+
field_validator,
|
|
25
|
+
SecretStr,
|
|
26
|
+
)
|
|
19
27
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
20
28
|
from typing import Any, Dict, List, Optional, NamedTuple
|
|
21
29
|
|
|
@@ -155,7 +163,7 @@ class Settings(BaseSettings):
|
|
|
155
163
|
#
|
|
156
164
|
|
|
157
165
|
APP_ENV: str = "dev"
|
|
158
|
-
LOG_LEVEL:
|
|
166
|
+
LOG_LEVEL: Optional[int] = None
|
|
159
167
|
PYTHONPATH: str = "."
|
|
160
168
|
CONFIDENT_REGION: Optional[str] = None
|
|
161
169
|
CONFIDENT_OPEN_BROWSER: Optional[bool] = True
|
|
@@ -287,9 +295,33 @@ class Settings(BaseSettings):
|
|
|
287
295
|
#
|
|
288
296
|
# Retry Policy
|
|
289
297
|
#
|
|
290
|
-
|
|
291
|
-
|
|
298
|
+
# Controls how Tenacity retries provider calls when the SDK isn't doing its own retries.
|
|
299
|
+
# Key concepts:
|
|
300
|
+
# - attempts count includes the first call. e.g. 1 = no retries, 2 = one retry.
|
|
301
|
+
# - backoff sleeps follow exponential growth with a cap, plus jitter. Expected jitter
|
|
302
|
+
# contribution is ~ JITTER/2 per sleep.
|
|
303
|
+
# - logging levels are looked up dynamically each attempt, so if you change LOG_LEVEL at runtime,
|
|
304
|
+
# the retry loggers will honor it without restart.
|
|
305
|
+
DEEPEVAL_SDK_RETRY_PROVIDERS: Optional[List[str]] = (
|
|
306
|
+
None # ["*"] to delegate all retries to SDKs
|
|
307
|
+
)
|
|
308
|
+
DEEPEVAL_RETRY_BEFORE_LOG_LEVEL: Optional[int] = (
|
|
309
|
+
None # default is LOG_LEVEL if set, else INFO
|
|
310
|
+
)
|
|
292
311
|
DEEPEVAL_RETRY_AFTER_LOG_LEVEL: Optional[int] = None # default -> ERROR
|
|
312
|
+
DEEPEVAL_RETRY_MAX_ATTEMPTS: conint(ge=1) = (
|
|
313
|
+
2 # attempts = first try + retries
|
|
314
|
+
)
|
|
315
|
+
DEEPEVAL_RETRY_INITIAL_SECONDS: confloat(ge=0) = (
|
|
316
|
+
1.0 # first sleep before retry, if any
|
|
317
|
+
)
|
|
318
|
+
DEEPEVAL_RETRY_EXP_BASE: confloat(ge=1) = (
|
|
319
|
+
2.0 # exponential growth factor for sleeps
|
|
320
|
+
)
|
|
321
|
+
DEEPEVAL_RETRY_JITTER: confloat(ge=0) = 2.0 # uniform jitter
|
|
322
|
+
DEEPEVAL_RETRY_CAP_SECONDS: confloat(ge=0) = (
|
|
323
|
+
5.0 # cap for each backoff sleep
|
|
324
|
+
)
|
|
293
325
|
|
|
294
326
|
#
|
|
295
327
|
# Telemetry and Debug
|
|
@@ -305,10 +337,17 @@ class Settings(BaseSettings):
|
|
|
305
337
|
SKIP_DEEPEVAL_MISSING_PARAMS: Optional[bool] = None
|
|
306
338
|
DEEPEVAL_VERBOSE_MODE: Optional[bool] = None
|
|
307
339
|
ENABLE_DEEPEVAL_CACHE: Optional[bool] = None
|
|
340
|
+
|
|
308
341
|
CONFIDENT_TRACE_FLUSH: Optional[bool] = None
|
|
309
342
|
CONFIDENT_TRACE_ENVIRONMENT: Optional[str] = "development"
|
|
310
343
|
CONFIDENT_TRACE_VERBOSE: Optional[bool] = True
|
|
311
|
-
|
|
344
|
+
CONFIDENT_TRACE_SAMPLE_RATE: Optional[float] = 1.0
|
|
345
|
+
|
|
346
|
+
CONFIDENT_METRIC_LOGGING_FLUSH: Optional[bool] = None
|
|
347
|
+
CONFIDENT_METRIC_LOGGING_VERBOSE: Optional[bool] = True
|
|
348
|
+
CONFIDENT_METRIC_LOGGING_SAMPLE_RATE: Optional[float] = 1.0
|
|
349
|
+
CONFIDENT_METRIC_LOGGING_ENABLED: Optional[bool] = True
|
|
350
|
+
|
|
312
351
|
OTEL_EXPORTER_OTLP_ENDPOINT: Optional[AnyUrl] = None
|
|
313
352
|
|
|
314
353
|
#
|
|
@@ -316,19 +355,93 @@ class Settings(BaseSettings):
|
|
|
316
355
|
#
|
|
317
356
|
MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS: float = 3.05
|
|
318
357
|
MEDIA_IMAGE_READ_TIMEOUT_SECONDS: float = 10.0
|
|
358
|
+
# DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS: per-attempt timeout for provider calls enforced by our retry decorator.
|
|
359
|
+
# This timeout interacts with retry policy and the task level budget (DEEPEVAL_PER_TASK_TIMEOUT_SECONDS) below.
|
|
360
|
+
# If you leave this at 0/None, the computed outer budget defaults to 180s.
|
|
361
|
+
DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS: Optional[confloat(ge=0)] = (
|
|
362
|
+
None # per-attempt timeout. Set 0/None to disable
|
|
363
|
+
)
|
|
319
364
|
|
|
320
365
|
#
|
|
321
|
-
# Async
|
|
366
|
+
# Async Document Pipelines
|
|
322
367
|
#
|
|
323
368
|
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
369
|
+
DEEPEVAL_MAX_CONCURRENT_DOC_PROCESSING: conint(ge=1) = 2
|
|
370
|
+
|
|
371
|
+
#
|
|
372
|
+
# Async Task Configuration
|
|
373
|
+
#
|
|
374
|
+
DEEPEVAL_TIMEOUT_THREAD_LIMIT: conint(ge=1) = 128
|
|
375
|
+
DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS: confloat(ge=0) = 5.0
|
|
376
|
+
# DEEPEVAL_PER_TASK_TIMEOUT_SECONDS is the outer time budget for one metric/task.
|
|
377
|
+
# It is computed from per-attempt timeout + retries/backoff unless you explicitly override it.
|
|
378
|
+
# - OVERRIDE = None or 0 -> auto compute as:
|
|
379
|
+
# attempts * per_attempt_timeout + sum(backoff_sleeps) + ~jitter/2 per sleep + 1s safety
|
|
380
|
+
# (If per_attempt_timeout is 0/None, the auto outer budget defaults to 180s.)
|
|
381
|
+
# - OVERRIDE > 0 -> use that exact value. A warning is logged if it is likely too small
|
|
382
|
+
# to permit the configured attempts/backoff.
|
|
383
|
+
#
|
|
384
|
+
# Tip:
|
|
385
|
+
# Most users only need to set DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS and DEEPEVAL_RETRY_MAX_ATTEMPTS.
|
|
386
|
+
# Leave the outer budget on auto unless you have very strict SLAs.
|
|
387
|
+
DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE: Optional[conint(ge=0)] = None
|
|
328
388
|
|
|
329
389
|
# Buffer time for gathering results from all tasks, added to the longest task duration
|
|
330
390
|
# Increase if many tasks are running concurrently
|
|
331
|
-
DEEPEVAL_TASK_GATHER_BUFFER_SECONDS:
|
|
391
|
+
DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: confloat(ge=0) = 60
|
|
392
|
+
|
|
393
|
+
###################
|
|
394
|
+
# Computed Fields #
|
|
395
|
+
###################
|
|
396
|
+
|
|
397
|
+
def _calc_auto_outer_timeout(self) -> int:
|
|
398
|
+
"""Compute outer budget from per-attempt timeout + retries/backoff.
|
|
399
|
+
Never reference the computed property itself here.
|
|
400
|
+
"""
|
|
401
|
+
attempts = self.DEEPEVAL_RETRY_MAX_ATTEMPTS or 1
|
|
402
|
+
timeout_seconds = float(self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
|
|
403
|
+
if timeout_seconds <= 0:
|
|
404
|
+
# No per-attempt timeout set -> default outer budget
|
|
405
|
+
return 180
|
|
406
|
+
|
|
407
|
+
sleeps = max(0, attempts - 1)
|
|
408
|
+
cur = float(self.DEEPEVAL_RETRY_INITIAL_SECONDS)
|
|
409
|
+
cap = float(self.DEEPEVAL_RETRY_CAP_SECONDS)
|
|
410
|
+
base = float(self.DEEPEVAL_RETRY_EXP_BASE)
|
|
411
|
+
jitter = float(self.DEEPEVAL_RETRY_JITTER)
|
|
412
|
+
|
|
413
|
+
backoff = 0.0
|
|
414
|
+
for _ in range(sleeps):
|
|
415
|
+
backoff += min(cap, cur)
|
|
416
|
+
cur *= base
|
|
417
|
+
backoff += sleeps * (jitter / 2.0) # expected jitter
|
|
418
|
+
|
|
419
|
+
safety_overhead = 1.0
|
|
420
|
+
return int(
|
|
421
|
+
math.ceil(attempts * timeout_seconds + backoff + safety_overhead)
|
|
422
|
+
)
|
|
423
|
+
|
|
424
|
+
@computed_field
|
|
425
|
+
@property
|
|
426
|
+
def DEEPEVAL_PER_TASK_TIMEOUT_SECONDS(self) -> int:
|
|
427
|
+
"""If OVERRIDE is set (nonzero), return it; else return the derived budget."""
|
|
428
|
+
outer = self.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE
|
|
429
|
+
if outer not in (None, 0):
|
|
430
|
+
# Warn if user-provided outer is likely to truncate retries
|
|
431
|
+
if (self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0) > 0:
|
|
432
|
+
min_needed = self._calc_auto_outer_timeout()
|
|
433
|
+
if int(outer) < min_needed:
|
|
434
|
+
if self.DEEPEVAL_VERBOSE_MODE:
|
|
435
|
+
logger.warning(
|
|
436
|
+
"Metric timeout (outer=%ss) is less than attempts × per-attempt "
|
|
437
|
+
"timeout + backoff (≈%ss). Retries may be cut short.",
|
|
438
|
+
int(outer),
|
|
439
|
+
min_needed,
|
|
440
|
+
)
|
|
441
|
+
return int(outer)
|
|
442
|
+
|
|
443
|
+
# Auto mode
|
|
444
|
+
return self._calc_auto_outer_timeout()
|
|
332
445
|
|
|
333
446
|
##############
|
|
334
447
|
# Validators #
|
|
@@ -384,7 +497,8 @@ class Settings(BaseSettings):
|
|
|
384
497
|
"OPENAI_COST_PER_INPUT_TOKEN",
|
|
385
498
|
"OPENAI_COST_PER_OUTPUT_TOKEN",
|
|
386
499
|
"TEMPERATURE",
|
|
387
|
-
"
|
|
500
|
+
"CONFIDENT_TRACE_SAMPLE_RATE",
|
|
501
|
+
"CONFIDENT_METRIC_LOGGING_SAMPLE_RATE",
|
|
388
502
|
mode="before",
|
|
389
503
|
)
|
|
390
504
|
@classmethod
|
|
@@ -396,13 +510,17 @@ class Settings(BaseSettings):
|
|
|
396
510
|
return None
|
|
397
511
|
return float(v)
|
|
398
512
|
|
|
399
|
-
@field_validator(
|
|
513
|
+
@field_validator(
|
|
514
|
+
"CONFIDENT_TRACE_SAMPLE_RATE", "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE"
|
|
515
|
+
)
|
|
400
516
|
@classmethod
|
|
401
517
|
def _validate_sample_rate(cls, v):
|
|
402
518
|
if v is None:
|
|
403
519
|
return None
|
|
404
520
|
if not (0.0 <= float(v) <= 1.0):
|
|
405
|
-
raise ValueError(
|
|
521
|
+
raise ValueError(
|
|
522
|
+
"CONFIDENT_TRACE_SAMPLE_RATE or CONFIDENT_METRIC_LOGGING_SAMPLE_RATE must be between 0 and 1"
|
|
523
|
+
)
|
|
406
524
|
return float(v)
|
|
407
525
|
|
|
408
526
|
@field_validator("DEEPEVAL_DEFAULT_SAVE", mode="before")
|
|
@@ -474,7 +592,9 @@ class Settings(BaseSettings):
|
|
|
474
592
|
if s in SUPPORTED_PROVIDER_SLUGS:
|
|
475
593
|
normalized.append(s)
|
|
476
594
|
else:
|
|
477
|
-
if
|
|
595
|
+
if parse_bool(
|
|
596
|
+
os.getenv("DEEPEVAL_VERBOSE_MODE"), default=False
|
|
597
|
+
):
|
|
478
598
|
logger.warning("Unknown provider slug %r dropped", item)
|
|
479
599
|
|
|
480
600
|
if star:
|
|
@@ -487,6 +607,7 @@ class Settings(BaseSettings):
|
|
|
487
607
|
@field_validator(
|
|
488
608
|
"DEEPEVAL_RETRY_BEFORE_LOG_LEVEL",
|
|
489
609
|
"DEEPEVAL_RETRY_AFTER_LOG_LEVEL",
|
|
610
|
+
"LOG_LEVEL",
|
|
490
611
|
mode="before",
|
|
491
612
|
)
|
|
492
613
|
@classmethod
|
|
@@ -524,6 +645,10 @@ class Settings(BaseSettings):
|
|
|
524
645
|
# Persistence support #
|
|
525
646
|
#######################
|
|
526
647
|
class _SettingsEditCtx:
|
|
648
|
+
COMPUTED_FIELDS: frozenset[str] = frozenset(
|
|
649
|
+
{"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS"}
|
|
650
|
+
)
|
|
651
|
+
|
|
527
652
|
def __init__(
|
|
528
653
|
self,
|
|
529
654
|
settings: "Settings",
|
|
@@ -559,8 +684,11 @@ class Settings(BaseSettings):
|
|
|
559
684
|
# lazy import legacy JSON store deps
|
|
560
685
|
from deepeval.key_handler import KEY_FILE_HANDLER
|
|
561
686
|
|
|
687
|
+
model_fields = type(self._s).model_fields
|
|
688
|
+
# Exclude computed fields from persistence
|
|
689
|
+
|
|
562
690
|
# compute diff of changed fields
|
|
563
|
-
after = {k: getattr(self._s, k) for k in
|
|
691
|
+
after = {k: getattr(self._s, k) for k in model_fields}
|
|
564
692
|
|
|
565
693
|
before_norm = {
|
|
566
694
|
k: _normalize_for_env(v) for k, v in self._before.items()
|
|
@@ -570,12 +698,21 @@ class Settings(BaseSettings):
|
|
|
570
698
|
changed_keys = {
|
|
571
699
|
k for k in after_norm if after_norm[k] != before_norm.get(k)
|
|
572
700
|
}
|
|
701
|
+
changed_keys -= self.COMPUTED_FIELDS
|
|
702
|
+
|
|
573
703
|
if not changed_keys:
|
|
574
704
|
self.result = PersistResult(False, None, {})
|
|
575
705
|
return False
|
|
576
706
|
|
|
577
707
|
updates = {k: after[k] for k in changed_keys}
|
|
578
708
|
|
|
709
|
+
if "LOG_LEVEL" in updates:
|
|
710
|
+
from deepeval.config.logging import (
|
|
711
|
+
apply_deepeval_log_level,
|
|
712
|
+
)
|
|
713
|
+
|
|
714
|
+
apply_deepeval_log_level()
|
|
715
|
+
|
|
579
716
|
#
|
|
580
717
|
# .deepeval JSON support
|
|
581
718
|
#
|
|
@@ -681,4 +818,27 @@ def get_settings() -> Settings:
|
|
|
681
818
|
global _settings_singleton
|
|
682
819
|
if _settings_singleton is None:
|
|
683
820
|
_settings_singleton = Settings()
|
|
821
|
+
from deepeval.config.logging import apply_deepeval_log_level
|
|
822
|
+
|
|
823
|
+
apply_deepeval_log_level()
|
|
684
824
|
return _settings_singleton
|
|
825
|
+
|
|
826
|
+
|
|
827
|
+
def reset_settings(*, reload_dotenv: bool = False) -> Settings:
|
|
828
|
+
"""
|
|
829
|
+
Drop the cached Settings singleton and rebuild it from the current process
|
|
830
|
+
environment.
|
|
831
|
+
|
|
832
|
+
Args:
|
|
833
|
+
reload_dotenv: When True, call `autoload_dotenv()` before re-instantiating,
|
|
834
|
+
which merges .env values into os.environ (never overwriting
|
|
835
|
+
existing process env vars).
|
|
836
|
+
|
|
837
|
+
Returns:
|
|
838
|
+
The fresh Settings instance.
|
|
839
|
+
"""
|
|
840
|
+
global _settings_singleton
|
|
841
|
+
if reload_dotenv:
|
|
842
|
+
autoload_dotenv()
|
|
843
|
+
_settings_singleton = None
|
|
844
|
+
return get_settings()
|
deepeval/constants.py
CHANGED
|
@@ -9,9 +9,16 @@ LOGIN_PROMPT = "\n✨👀 Looking for a place for your LLM test data to live
|
|
|
9
9
|
|
|
10
10
|
CONFIDENT_TRACE_VERBOSE = "CONFIDENT_TRACE_VERBOSE"
|
|
11
11
|
CONFIDENT_TRACE_FLUSH = "CONFIDENT_TRACE_FLUSH"
|
|
12
|
-
|
|
12
|
+
CONFIDENT_TRACE_SAMPLE_RATE = "CONFIDENT_TRACE_SAMPLE_RATE"
|
|
13
13
|
CONFIDENT_TRACE_ENVIRONMENT = "CONFIDENT_TRACE_ENVIRONMENT"
|
|
14
14
|
CONFIDENT_TRACING_ENABLED = "CONFIDENT_TRACING_ENABLED"
|
|
15
|
+
|
|
16
|
+
CONFIDENT_METRIC_LOGGING_VERBOSE = "CONFIDENT_METRIC_LOGGING_VERBOSE"
|
|
17
|
+
CONFIDENT_METRIC_LOGGING_FLUSH = "CONFIDENT_METRIC_LOGGING_FLUSH"
|
|
18
|
+
CONFIDENT_METRIC_LOGGING_SAMPLE_RATE = "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE"
|
|
19
|
+
CONFIDENT_METRIC_LOGGING_ENABLED = "CONFIDENT_METRIC_LOGGING_ENABLED"
|
|
20
|
+
|
|
21
|
+
|
|
15
22
|
CONFIDENT_OPEN_BROWSER = "CONFIDENT_OPEN_BROWSER"
|
|
16
23
|
CONFIDENT_TEST_CASE_BATCH_SIZE = "CONFIDENT_TEST_CASE_BATCH_SIZE"
|
|
17
24
|
|
deepeval/dataset/dataset.py
CHANGED
|
@@ -49,7 +49,7 @@ from deepeval.utils import (
|
|
|
49
49
|
from deepeval.test_run import (
|
|
50
50
|
global_test_run_manager,
|
|
51
51
|
)
|
|
52
|
-
|
|
52
|
+
|
|
53
53
|
from deepeval.tracing import trace_manager
|
|
54
54
|
from deepeval.tracing.tracing import EVAL_DUMMY_SPAN_NAME
|
|
55
55
|
|
|
@@ -1248,16 +1248,7 @@ class EvaluationDataset:
|
|
|
1248
1248
|
display_config.file_output_dir,
|
|
1249
1249
|
)
|
|
1250
1250
|
|
|
1251
|
-
#
|
|
1252
|
-
test_run = global_test_run_manager.get_test_run()
|
|
1253
|
-
if len(openai_test_case_pairs) > 0:
|
|
1254
|
-
raw_hyperparameters = openai_test_case_pairs[-1].hyperparameters
|
|
1255
|
-
test_run.hyperparameters = process_hyperparameters(
|
|
1256
|
-
raw_hyperparameters
|
|
1257
|
-
)
|
|
1258
|
-
|
|
1259
|
-
# clean up
|
|
1260
|
-
openai_test_case_pairs.clear()
|
|
1251
|
+
# save test run
|
|
1261
1252
|
global_test_run_manager.save_test_run(TEMP_FILE_PATH)
|
|
1262
1253
|
|
|
1263
1254
|
# sandwich end trace for OTEL
|
deepeval/dataset/utils.py
CHANGED
deepeval/evaluate/evaluate.py
CHANGED
|
@@ -28,7 +28,10 @@ from deepeval.evaluate.utils import (
|
|
|
28
28
|
from deepeval.dataset import Golden
|
|
29
29
|
from deepeval.prompt import Prompt
|
|
30
30
|
from deepeval.test_case.utils import check_valid_test_cases_type
|
|
31
|
-
from deepeval.test_run.hyperparameters import
|
|
31
|
+
from deepeval.test_run.hyperparameters import (
|
|
32
|
+
process_hyperparameters,
|
|
33
|
+
process_prompts,
|
|
34
|
+
)
|
|
32
35
|
from deepeval.test_run.test_run import TEMP_FILE_PATH
|
|
33
36
|
from deepeval.utils import (
|
|
34
37
|
get_or_create_event_loop,
|
|
@@ -267,6 +270,7 @@ def evaluate(
|
|
|
267
270
|
|
|
268
271
|
test_run = global_test_run_manager.get_test_run()
|
|
269
272
|
test_run.hyperparameters = process_hyperparameters(hyperparameters)
|
|
273
|
+
test_run.prompts = process_prompts(hyperparameters)
|
|
270
274
|
global_test_run_manager.save_test_run(TEMP_FILE_PATH)
|
|
271
275
|
res = global_test_run_manager.wrap_up_test_run(
|
|
272
276
|
run_duration, display_table=False
|