deepeval 3.4.8__py3-none-any.whl → 3.5.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (47)
  1. deepeval/__init__.py +8 -5
  2. deepeval/_version.py +1 -1
  3. deepeval/benchmarks/drop/drop.py +2 -3
  4. deepeval/benchmarks/hellaswag/hellaswag.py +2 -2
  5. deepeval/benchmarks/logi_qa/logi_qa.py +2 -2
  6. deepeval/benchmarks/math_qa/math_qa.py +2 -2
  7. deepeval/benchmarks/mmlu/mmlu.py +2 -2
  8. deepeval/benchmarks/truthful_qa/truthful_qa.py +2 -2
  9. deepeval/cli/main.py +561 -727
  10. deepeval/confident/api.py +30 -14
  11. deepeval/config/__init__.py +0 -0
  12. deepeval/config/settings.py +565 -0
  13. deepeval/config/settings_manager.py +133 -0
  14. deepeval/config/utils.py +86 -0
  15. deepeval/dataset/__init__.py +1 -0
  16. deepeval/dataset/dataset.py +70 -10
  17. deepeval/dataset/test_run_tracer.py +82 -0
  18. deepeval/dataset/utils.py +23 -0
  19. deepeval/integrations/pydantic_ai/__init__.py +2 -4
  20. deepeval/integrations/pydantic_ai/{setup.py → otel.py} +0 -8
  21. deepeval/integrations/pydantic_ai/patcher.py +376 -0
  22. deepeval/key_handler.py +1 -0
  23. deepeval/metrics/answer_relevancy/template.py +7 -2
  24. deepeval/metrics/faithfulness/template.py +11 -8
  25. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +6 -4
  26. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +6 -4
  27. deepeval/metrics/tool_correctness/tool_correctness.py +7 -3
  28. deepeval/models/llms/amazon_bedrock_model.py +24 -3
  29. deepeval/models/llms/grok_model.py +1 -1
  30. deepeval/models/llms/kimi_model.py +1 -1
  31. deepeval/models/llms/openai_model.py +37 -41
  32. deepeval/models/retry_policy.py +280 -0
  33. deepeval/openai_agents/agent.py +4 -2
  34. deepeval/test_run/api.py +1 -0
  35. deepeval/tracing/otel/exporter.py +20 -8
  36. deepeval/tracing/otel/utils.py +57 -0
  37. deepeval/tracing/perf_epoch_bridge.py +4 -4
  38. deepeval/tracing/tracing.py +37 -16
  39. deepeval/tracing/utils.py +98 -1
  40. deepeval/utils.py +111 -70
  41. {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/METADATA +16 -13
  42. {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/RECORD +45 -40
  43. deepeval/env.py +0 -35
  44. deepeval/integrations/pydantic_ai/agent.py +0 -364
  45. {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/LICENSE.md +0 -0
  46. {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/WHEEL +0 -0
  47. {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/entry_points.txt +0 -0
deepeval/config/settings_manager.py ADDED
@@ -0,0 +1,133 @@
+ """
+ Applies CLI driven updates to the live Settings and optionally persists them to a
+ dotenv file. Also syncs os.environ, handles unsets, and warns on unknown fields.
+ Primary entrypoint: update_settings_and_persist.
+ """
+
+ import logging
+ import os
+
+ from difflib import get_close_matches
+ from pathlib import Path
+ from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
+ from enum import Enum
+
+ from pydantic import SecretStr
+ from deepeval.config.settings import get_settings, _SAVE_RE
+ from deepeval.cli.dotenv_handler import DotenvHandler
+ from deepeval.utils import bool_to_env_str
+
+ logger = logging.getLogger(__name__)
+ StrOrEnum = Union[str, Enum]
+
+
+ def _env_key(k: StrOrEnum) -> str:
+     return k.value if isinstance(k, Enum) else str(k)
+
+
+ def _normalize_for_env(val: Any) -> Optional[str]:
+     """Convert typed value to string for dotenv + os.environ; None -> unset."""
+     if val is None:
+         return None
+     if isinstance(val, SecretStr):
+         return val.get_secret_value()
+     if isinstance(val, bool):
+         return bool_to_env_str(val)
+     return str(val)
+
+
+ def _resolve_save_path(save_opt: Optional[str]) -> Tuple[bool, Optional[Path]]:
+     """
+     Returns (ok, path).
+     - ok=False -> invalid save option format
+     - ok=True, path=None -> no persistence requested
+     - ok=True, path=Path -> persist to that file
+     """
+     raw = (
+         save_opt if save_opt is not None else os.getenv("DEEPEVAL_DEFAULT_SAVE")
+     )
+     if not raw:
+         return True, None
+     m = _SAVE_RE.match(raw.strip())
+     if not m:
+         return False, None
+     path = m.group("path") or ".env.local"
+     path = Path(os.path.expanduser(os.path.expandvars(path)))
+     return True, path
+
+
+ def update_settings_and_persist(
+     updates: Mapping[StrOrEnum, Any],
+     *,
+     save: Optional[str] = None,
+     unset: Iterable[StrOrEnum] = (),
+     persist_dotenv: bool = True,
+ ) -> Tuple[bool, Optional[Path]]:
+     """
+     Write and update:
+     - validate + assign into live Settings()
+     - update os.environ
+     - persist to dotenv, if `save` or DEEPEVAL_DEFAULT_SAVE provided
+     - unset keys where value is None or explicitly in `unset`
+     Returns (handled, path_to_dotenv_if_any).
+     """
+     settings = get_settings()
+
+     # validate + assign into settings.
+     # validation is handled in Settings as long as validate_assignment=True
+     typed: Dict[str, Any] = {}
+     for key, value in updates.items():
+         k = _env_key(key)
+         if k not in type(settings).model_fields:
+             suggestion = get_close_matches(
+                 k, type(settings).model_fields.keys(), n=1
+             )
+             if suggestion:
+                 logger.warning(
+                     "Unknown settings field '%s'; did you mean '%s'? Ignoring.",
+                     k,
+                     suggestion[0],
+                     stacklevel=2,
+                 )
+             else:
+                 logger.warning(
+                     "Unknown settings field '%s'; ignoring.", k, stacklevel=2
+                 )
+             continue
+
+         setattr(settings, k, value)
+         # coercion is handled in Settings
+         typed[k] = getattr(settings, k)
+
+     # build env maps
+     to_write: Dict[str, str] = {}
+     to_unset: set[str] = set(_env_key(k) for k in unset)
+
+     for k, v in typed.items():
+         env_val = _normalize_for_env(v)
+         if env_val is None:
+             to_unset.add(k)
+         else:
+             to_write[k] = env_val
+
+     # update process env so that it is effective immediately
+     for k, v in to_write.items():
+         os.environ[k] = v
+     for k in to_unset:
+         os.environ.pop(k, None)
+
+     if not persist_dotenv:
+         return True, None
+
+     # persist to dotenv if save is ok
+     ok, path = _resolve_save_path(save)
+     if not ok:
+         return False, None  # unsupported --save
+     if path:
+         h = DotenvHandler(path)
+         if to_write:
+             h.upsert(to_write)
+         if to_unset:
+             h.unset(to_unset)
+         return True, path
+     return True, None
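
Usage sketch (illustrative, not part of the diff): how a CLI handler might call the new helper. The field name, the --save value, and the exact save syntax are assumptions; the syntax is governed by _SAVE_RE in settings.py, which this diff does not show.

    # hypothetical caller; field name and save syntax are illustrative
    from deepeval.config.settings_manager import update_settings_and_persist

    handled, dotenv_path = update_settings_and_persist(
        {"CONFIDENT_API_KEY": "sk-..."},  # validated/coerced by Settings on assignment
        save="dotenv:.env.local",         # also honored via DEEPEVAL_DEFAULT_SAVE
        unset=["OPENAI_API_KEY"],         # dropped from os.environ and the dotenv file
    )
    if not handled:
        raise SystemExit("unrecognized --save format")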
deepeval/config/utils.py ADDED
@@ -0,0 +1,86 @@
+ import os
+ from typing import Any, Optional
+
+ _TRUTHY = frozenset({"1", "true", "t", "yes", "y", "on", "enable", "enabled"})
+ _FALSY = frozenset({"0", "false", "f", "no", "n", "off", "disable", "disabled"})
+
+
+ def parse_bool(value: Any, default: bool = False) -> bool:
+     """
+     Parse an arbitrary value into a boolean using env style semantics.
+
+     Truthy tokens (case-insensitive, quotes/whitespace ignored):
+         1, true, t, yes, y, on, enable, enabled
+     Falsy tokens:
+         0, false, f, no, n, off, disable, disabled
+
+     - bool -> returned as is
+     - None -> returns `default`
+     - int/float -> False if == 0, else True
+     - str/other -> matched against tokens above; non-matching -> `default`
+
+     Args:
+         value: Value to interpret.
+         default: Value to return if `value` is None or doesn't match any token.
+
+     Returns:
+         The interpreted boolean.
+     """
+     if isinstance(value, bool):
+         return value
+     if value is None:
+         return default
+     if isinstance(value, (int, float)):
+         return value != 0
+
+     s = str(value).strip().strip('"').strip("'").lower()
+     if not s:
+         return default
+     if s in _TRUTHY:
+         return True
+     if s in _FALSY:
+         return False
+     return default
+
+
+ def get_env_bool(key: str, default: bool = False) -> bool:
+     """
+     Read an environment variable and parse it as a boolean using `parse_bool`.
+
+     Args:
+         key: Environment variable name.
+         default: Returned when the variable is unset or does not match any token.
+
+     Returns:
+         Parsed boolean value.
+     """
+     return parse_bool(os.getenv(key), default)
+
+
+ def bool_to_env_str(value: bool) -> str:
+     """
+     Canonicalize a boolean to the env/dotenv string form: "1" or "0".
+
+     Args:
+         value: Boolean to serialize.
+
+     Returns:
+         "1" if True, "0" if False.
+     """
+     return "1" if bool(value) else "0"
+
+
+ def set_env_bool(key: str, value: Optional[bool] = False) -> None:
+     """
+     Set an environment variable to a canonical boolean string ("1" or "0").
+
+     Args:
+         key: The environment variable name to set.
+         value: The boolean value to store. If None, it is treated as False.
+             True -> "1", False/None -> "0".
+
+     Notes:
+         - This function always overwrites the variable in `os.environ`.
+         - Use `get_env_bool` to read back and parse the value safely.
+     """
+     os.environ[key] = bool_to_env_str(bool(value))
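
Usage sketch (illustrative, not part of the diff): a few calls matching the token tables documented above. The env var name is only an example.

    from deepeval.config.utils import parse_bool, get_env_bool, set_env_bool

    parse_bool("YES")          # True: truthy token, case-insensitive
    parse_bool(" 'off' ")      # False: quotes and whitespace are stripped first
    parse_bool("maybe", True)  # True: no token matches, so the default wins
    parse_bool(0.0)            # False: numeric zero

    set_env_bool("MY_FLAG", True)   # writes the canonical "1"
    assert get_env_bool("MY_FLAG") is True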
deepeval/dataset/__init__.py CHANGED
@@ -1,4 +1,5 @@
  from .dataset import EvaluationDataset
  from .golden import Golden, ConversationalGolden
+ from .test_run_tracer import init_global_test_run_tracer

  __all__ = ["EvaluationDataset", "Golden", "ConversationalGolden"]
deepeval/dataset/dataset.py CHANGED
@@ -1,6 +1,8 @@
  from asyncio import Task
  from typing import Iterator, List, Optional, Union, Literal
  from dataclasses import dataclass, field
+ from opentelemetry.trace import Tracer
+ from opentelemetry.context import Context, attach, detach
  from rich.console import Console
  from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
  import json
@@ -10,6 +12,8 @@ import os
  import datetime
  import time
  import ast
+ import uuid
+ from opentelemetry import baggage

  from deepeval.confident.api import Api, Endpoints, HttpMethods
  from deepeval.dataset.utils import (
@@ -18,6 +22,7 @@ from deepeval.dataset.utils import (
      convert_convo_goldens_to_convo_test_cases,
      convert_convo_test_cases_to_convo_goldens,
      format_turns,
+     check_tracer,
      parse_turns,
      trimAndLoadJson,
  )
@@ -47,6 +52,7 @@ from deepeval.test_run import (
  from deepeval.dataset.types import global_evaluation_tasks
  from deepeval.openai.utils import openai_test_case_pairs
  from deepeval.tracing import trace_manager
+ from deepeval.tracing.tracing import EVAL_DUMMY_SPAN_NAME


  valid_file_types = ["csv", "json", "jsonl"]
@@ -1097,6 +1103,7 @@ class EvaluationDataset:
          cache_config: Optional["CacheConfig"] = None,
          error_config: Optional["ErrorConfig"] = None,
          async_config: Optional["AsyncConfig"] = None,
+         run_otel: Optional[bool] = False,
      ) -> Iterator[Golden]:
          from deepeval.evaluate.utils import (
              aggregate_metric_pass_rates,
@@ -1133,9 +1140,14 @@ class EvaluationDataset:
          start_time = time.perf_counter()
          test_results: List[TestResult] = []

+         # sandwich start trace for OTEL
+         if run_otel:
+             ctx = self._start_otel_test_run()  # ignored span
+             ctx_token = attach(ctx)
+
          if async_config.run_async:
              loop = get_or_create_event_loop()
-             yield from a_execute_agentic_test_cases_from_loop(
+             for golden in a_execute_agentic_test_cases_from_loop(
                  goldens=goldens,
                  identifier=identifier,
                  loop=loop,
@@ -1145,9 +1157,19 @@ class EvaluationDataset:
                  cache_config=cache_config,
                  error_config=error_config,
                  async_config=async_config,
-             )
+             ):
+                 if run_otel:
+                     _tracer = check_tracer()
+                     with _tracer.start_as_current_span(
+                         name=EVAL_DUMMY_SPAN_NAME,
+                         context=ctx,
+                     ):
+                         yield golden
+                 else:
+                     yield golden
+
          else:
-             yield from execute_agentic_test_cases_from_loop(
+             for golden in execute_agentic_test_cases_from_loop(
                  goldens=goldens,
                  trace_metrics=metrics,
                  display_config=display_config,
@@ -1155,7 +1177,16 @@ class EvaluationDataset:
                  error_config=error_config,
                  test_results=test_results,
                  identifier=identifier,
-             )
+             ):
+                 if run_otel:
+                     _tracer = check_tracer()
+                     with _tracer.start_as_current_span(
+                         name=EVAL_DUMMY_SPAN_NAME,
+                         context=ctx,
+                     ):
+                         yield golden
+                 else:
+                     yield golden

          end_time = time.perf_counter()
          run_duration = end_time - start_time
@@ -1184,12 +1215,41 @@ class EvaluationDataset:
          # clean up
          openai_test_case_pairs.clear()
          global_test_run_manager.save_test_run(TEMP_FILE_PATH)
-         confident_link = global_test_run_manager.wrap_up_test_run(
-             run_duration, display_table=False
-         )
-         return EvaluationResult(
-             test_results=test_results, confident_link=confident_link
-         )
+
+         # sandwich end trace for OTEL
+         if run_otel:
+             self._end_otel_test_run(ctx)
+             detach(ctx_token)
+
+         else:
+             confident_link = global_test_run_manager.wrap_up_test_run(
+                 run_duration, display_table=False
+             )
+             return EvaluationResult(
+                 test_results=test_results, confident_link=confident_link
+             )

      def evaluate(self, task: Task):
          global_evaluation_tasks.append(task)
+
+     def _start_otel_test_run(self, tracer: Optional[Tracer] = None) -> Context:
+         _tracer = check_tracer(tracer)
+         run_id = str(uuid.uuid4())
+         print("Starting OTLP test run with run_id: ", run_id)
+         ctx = baggage.set_baggage(
+             "confident.test_run.id", run_id, context=Context()
+         )
+         with _tracer.start_as_current_span(
+             "start_otel_test_run", context=ctx
+         ) as span:
+             span.set_attribute("confident.test_run.id", run_id)
+         return ctx
+
+     def _end_otel_test_run(self, ctx: Context, tracer: Optional[Tracer] = None):
+         run_id = baggage.get_baggage("confident.test_run.id", context=ctx)
+         print("Ending OTLP test run with run_id: ", run_id)
+         _tracer = check_tracer(tracer)
+         with _tracer.start_as_current_span(
+             "stop_otel_test_run", context=ctx
+         ) as span:
+             span.set_attribute("confident.test_run.id", run_id)
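
Usage sketch (illustrative, not part of the diff): taken together with test_run_tracer.py below, the intended call pattern appears to be roughly the following. The iterator's name is not visible in these hunks (only its new run_otel parameter is), so evals_iterator is an assumption.

    from deepeval.dataset import EvaluationDataset, init_global_test_run_tracer

    init_global_test_run_tracer()                # needs a Confident API key to be configured
    dataset = EvaluationDataset(goldens=goldens)
    for golden in dataset.evals_iterator(run_otel=True):  # method name assumed
        run_my_llm_app(golden.input)             # spans emitted here carry confident.test_run.id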
deepeval/dataset/test_run_tracer.py ADDED
@@ -0,0 +1,82 @@
+ import os
+ from typing import Optional
+ from opentelemetry import baggage
+ from opentelemetry.trace import Tracer as OTelTracer
+ from opentelemetry.sdk.trace import SpanProcessor
+ from opentelemetry.sdk.trace import TracerProvider
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor
+
+ try:
+     from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
+         OTLPSpanExporter,
+     )
+
+     is_opentelemetry_installed = True
+ except Exception:
+     is_opentelemetry_installed = False
+
+
+ def is_opentelemetry_available():
+     if not is_opentelemetry_installed:
+         raise ImportError(
+             "OpenTelemetry SDK is not available. Please install it with `pip install opentelemetry-exporter-otlp-proto-http`."
+         )
+     return True
+
+
+ from deepeval.confident.api import get_confident_api_key
+
+ OTLP_ENDPOINT = (
+     os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
+     if os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
+     else "https://otel.confident-ai.com"
+ )
+ # OTLP_ENDPOINT = "http://127.0.0.1:4318"
+
+ # Module-level globals to be imported and used by other code
+ GLOBAL_TEST_RUN_TRACER_PROVIDER: Optional[TracerProvider] = None
+ GLOBAL_TEST_RUN_TRACER: Optional[OTelTracer] = None
+
+
+ class RunIdSpanProcessor(SpanProcessor):
+     def on_start(self, span, parent_context):
+         run_id = baggage.get_baggage(
+             "confident.test_run.id", context=parent_context
+         )
+         if run_id:
+             span.set_attribute("confident.test_run.id", run_id)
+
+     def on_end(self, span) -> None:  # type: ignore[override]
+         # No-op
+         return None
+
+     def shutdown(self) -> None:  # type: ignore[override]
+         # No-op
+         return None
+
+     def force_flush(self, timeout_millis: int = 30000) -> bool:  # type: ignore[override]
+         # No-op
+         return True
+
+
+ def init_global_test_run_tracer(api_key: Optional[str] = None):
+     is_opentelemetry_available()
+     api_key = get_confident_api_key()
+     if api_key is None:
+         raise ValueError("CONFIDENT_API_KEY is not set")
+
+     provider = TracerProvider()
+     exporter = OTLPSpanExporter(
+         endpoint=f"{OTLP_ENDPOINT}/v1/traces",
+         headers={"x-confident-api-key": api_key},
+     )
+     provider.add_span_processor(RunIdSpanProcessor())
+     provider.add_span_processor(BatchSpanProcessor(span_exporter=exporter))
+     tracer = provider.get_tracer("deepeval_tracer")
+
+     global GLOBAL_TEST_RUN_TRACER_PROVIDER
+     global GLOBAL_TEST_RUN_TRACER
+     GLOBAL_TEST_RUN_TRACER_PROVIDER = provider
+     GLOBAL_TEST_RUN_TRACER = tracer
+
+     return provider, tracer
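
Note that the api_key parameter is immediately overwritten by get_confident_api_key(), so passing a key has no effect as written. A minimal smoke test of the initializer (illustrative, not part of the diff), assuming a key is already configured:

    from deepeval.dataset.test_run_tracer import init_global_test_run_tracer

    provider, tracer = init_global_test_run_tracer()
    with tracer.start_as_current_span("smoke-test"):
        pass                   # RunIdSpanProcessor stamps confident.test_run.id when baggage is present
    provider.force_flush()     # BatchSpanProcessor exports to {OTLP_ENDPOINT}/v1/traces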
deepeval/dataset/utils.py CHANGED
@@ -2,6 +2,10 @@ from typing import List, Optional, Any
  import json
  import re

+ from opentelemetry.trace import Tracer
+ from opentelemetry import trace
+ from opentelemetry.trace import NoOpTracerProvider
+
  from deepeval.dataset.api import Golden
  from deepeval.dataset.golden import ConversationalGolden
  from deepeval.test_case import LLMTestCase, ConversationalTestCase, Turn
@@ -151,3 +155,22 @@ def parse_turns(turns_str: str) -> List[Turn]:
              )
          )
      return res
+
+
+ def check_tracer(tracer: Optional[Tracer] = None) -> Tracer:
+     if tracer:
+         return tracer
+     # Prefer module-level test-run tracer if available
+     try:
+         from deepeval.dataset.test_run_tracer import (
+             GLOBAL_TEST_RUN_TRACER,
+         )
+
+         if GLOBAL_TEST_RUN_TRACER is not None:
+             return GLOBAL_TEST_RUN_TRACER
+     except Exception:
+         raise RuntimeError(
+             "No global OpenTelemetry tracer provider is configured."  # TODO: link to docs
+         )
+
+     return GLOBAL_TEST_RUN_TRACER
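
Two things worth noting about this helper: an explicitly passed tracer short-circuits the global lookup, and if the import succeeds but init_global_test_run_tracer() was never called, the final line returns None despite the Tracer return annotation. Illustrative call (not part of the diff):

    from opentelemetry import trace
    from deepeval.dataset.utils import check_tracer

    tracer = check_tracer(trace.get_tracer("my-app"))  # explicit tracer wins; no global needed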
deepeval/integrations/pydantic_ai/__init__.py CHANGED
@@ -1,5 +1,3 @@
- from .agent import PydanticAIAgent as Agent
- from .setup import instrument_pydantic_ai
+ from .patcher import instrument as instrument_pydantic_ai

-
- __all__ = ["Agent", "instrument_pydantic_ai"]
+ __all__ = ["instrument_pydantic_ai"]
deepeval/integrations/pydantic_ai/{setup.py → otel.py} RENAMED
@@ -31,14 +31,6 @@ def instrument_pydantic_ai(api_key: Optional[str] = None):
      with capture_tracing_integration("pydantic_ai"):
          is_opentelemetry_available()

-         if api_key:
-             deepeval.login(api_key)
-
-         api_key = get_confident_api_key()
-
-         if not api_key:
-             raise ValueError("No api key provided.")
-
          # create a new tracer provider
          tracer_provider = TracerProvider()
          tracer_provider.add_span_processor(
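
With the PydanticAIAgent wrapper removed and __init__ re-exporting patcher.instrument, the public entrypoint stays the same. A hedged sketch (illustrative, not part of the diff; patcher.py internals are not shown in this section):

    from deepeval.integrations.pydantic_ai import instrument_pydantic_ai

    instrument_pydantic_ai()  # now backed by patcher.instrument; the api-key check above was removed from otel.py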