deepeval 3.4.7__py3-none-any.whl → 3.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. deepeval/__init__.py +8 -7
  2. deepeval/_version.py +1 -1
  3. deepeval/cli/dotenv_handler.py +71 -0
  4. deepeval/cli/main.py +1021 -280
  5. deepeval/cli/utils.py +116 -2
  6. deepeval/confident/api.py +29 -14
  7. deepeval/config/__init__.py +0 -0
  8. deepeval/config/settings.py +565 -0
  9. deepeval/config/settings_manager.py +133 -0
  10. deepeval/config/utils.py +86 -0
  11. deepeval/dataset/__init__.py +1 -0
  12. deepeval/dataset/dataset.py +70 -10
  13. deepeval/dataset/test_run_tracer.py +82 -0
  14. deepeval/dataset/utils.py +23 -0
  15. deepeval/key_handler.py +64 -2
  16. deepeval/metrics/__init__.py +4 -1
  17. deepeval/metrics/answer_relevancy/template.py +7 -2
  18. deepeval/metrics/conversational_dag/__init__.py +7 -0
  19. deepeval/metrics/conversational_dag/conversational_dag.py +139 -0
  20. deepeval/metrics/conversational_dag/nodes.py +931 -0
  21. deepeval/metrics/conversational_dag/templates.py +117 -0
  22. deepeval/metrics/dag/dag.py +13 -4
  23. deepeval/metrics/dag/graph.py +47 -15
  24. deepeval/metrics/dag/utils.py +103 -38
  25. deepeval/metrics/faithfulness/template.py +11 -8
  26. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +6 -4
  27. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +6 -4
  28. deepeval/metrics/tool_correctness/tool_correctness.py +7 -3
  29. deepeval/models/llms/amazon_bedrock_model.py +24 -3
  30. deepeval/models/llms/openai_model.py +37 -41
  31. deepeval/models/retry_policy.py +280 -0
  32. deepeval/openai_agents/agent.py +4 -2
  33. deepeval/synthesizer/chunking/doc_chunker.py +87 -51
  34. deepeval/test_run/api.py +1 -0
  35. deepeval/tracing/otel/exporter.py +20 -8
  36. deepeval/tracing/otel/utils.py +57 -0
  37. deepeval/tracing/tracing.py +37 -16
  38. deepeval/tracing/utils.py +98 -1
  39. deepeval/utils.py +111 -70
  40. {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/METADATA +3 -1
  41. {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/RECORD +44 -34
  42. deepeval/env.py +0 -35
  43. {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/LICENSE.md +0 -0
  44. {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/WHEEL +0 -0
  45. {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/entry_points.txt +0 -0
deepeval/config/utils.py ADDED
@@ -0,0 +1,86 @@
+ import os
+ from typing import Any, Optional
+
+ _TRUTHY = frozenset({"1", "true", "t", "yes", "y", "on", "enable", "enabled"})
+ _FALSY = frozenset({"0", "false", "f", "no", "n", "off", "disable", "disabled"})
+
+
+ def parse_bool(value: Any, default: bool = False) -> bool:
+     """
+     Parse an arbitrary value into a boolean using env style semantics.
+
+     Truthy tokens (case-insensitive, quotes/whitespace ignored):
+         1, true, t, yes, y, on, enable, enabled
+     Falsy tokens:
+         0, false, f, no, n, off, disable, disabled
+
+     - bool -> returned as is
+     - None -> returns `default`
+     - int/float -> False if == 0, else True
+     - str/other -> matched against tokens above; non-matching -> `default`
+
+     Args:
+         value: Value to interpret.
+         default: Value to return if `value` is None or doesn’t match any token.
+
+     Returns:
+         The interpreted boolean.
+     """
+     if isinstance(value, bool):
+         return value
+     if value is None:
+         return default
+     if isinstance(value, (int, float)):
+         return value != 0
+
+     s = str(value).strip().strip('"').strip("'").lower()
+     if not s:
+         return default
+     if s in _TRUTHY:
+         return True
+     if s in _FALSY:
+         return False
+     return default
+
+
+ def get_env_bool(key: str, default: bool = False) -> bool:
+     """
+     Read an environment variable and parse it as a boolean using `parse_bool`.
+
+     Args:
+         key: Environment variable name.
+         default: Returned when the variable is unset or does not match any token.
+
+     Returns:
+         Parsed boolean value.
+     """
+     return parse_bool(os.getenv(key), default)
+
+
+ def bool_to_env_str(value: bool) -> str:
+     """
+     Canonicalize a boolean to the env/dotenv string form: "1" or "0".
+
+     Args:
+         value: Boolean to serialize.
+
+     Returns:
+         "1" if True, "0" if False.
+     """
+     return "1" if bool(value) else "0"
+
+
+ def set_env_bool(key: str, value: Optional[bool] = False) -> None:
+     """
+     Set an environment variable to a canonical boolean string ("1" or "0").
+
+     Args:
+         key: The environment variable name to set.
+         value: The boolean value to store. If None, it is treated as False.
+             True -> "1", False/None -> "0".
+
+     Notes:
+         - This function always overwrites the variable in `os.environ`.
+         - Use `get_env_bool` to read back and parse the value safely.
+     """
+     os.environ[key] = bool_to_env_str(bool(value))
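
The new deepeval/config/utils.py helpers centralize boolean handling for environment and dotenv values. A minimal usage sketch (illustrative only; the variable names below are placeholders, not settings defined by deepeval):

import os
from deepeval.config.utils import parse_bool, get_env_bool, set_env_bool

parse_bool("YES")            # True  (token matching is case-insensitive)
parse_bool(" 'off' ")        # False (surrounding quotes/whitespace are stripped)
parse_bool("maybe", True)    # True  (unrecognized tokens fall back to the default)

set_env_bool("MY_FEATURE_FLAG", True)      # stores the canonical "1"
print(os.environ["MY_FEATURE_FLAG"])       # "1"
print(get_env_bool("MY_FEATURE_FLAG"))     # True
print(get_env_bool("UNSET_FLAG", True))    # True when the variable is absent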
deepeval/dataset/__init__.py CHANGED
@@ -1,4 +1,5 @@
  from .dataset import EvaluationDataset
  from .golden import Golden, ConversationalGolden
+ from .test_run_tracer import init_global_test_run_tracer

  __all__ = ["EvaluationDataset", "Golden", "ConversationalGolden"]
deepeval/dataset/dataset.py CHANGED
@@ -1,6 +1,8 @@
  from asyncio import Task
  from typing import Iterator, List, Optional, Union, Literal
  from dataclasses import dataclass, field
+ from opentelemetry.trace import Tracer
+ from opentelemetry.context import Context, attach, detach
  from rich.console import Console
  from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
  import json
@@ -10,6 +12,8 @@ import os
  import datetime
  import time
  import ast
+ import uuid
+ from opentelemetry import baggage

  from deepeval.confident.api import Api, Endpoints, HttpMethods
  from deepeval.dataset.utils import (
@@ -18,6 +22,7 @@ from deepeval.dataset.utils import (
      convert_convo_goldens_to_convo_test_cases,
      convert_convo_test_cases_to_convo_goldens,
      format_turns,
+     check_tracer,
      parse_turns,
      trimAndLoadJson,
  )
@@ -47,6 +52,7 @@ from deepeval.test_run import (
  from deepeval.dataset.types import global_evaluation_tasks
  from deepeval.openai.utils import openai_test_case_pairs
  from deepeval.tracing import trace_manager
+ from deepeval.tracing.tracing import EVAL_DUMMY_SPAN_NAME


  valid_file_types = ["csv", "json", "jsonl"]
@@ -1097,6 +1103,7 @@ class EvaluationDataset:
          cache_config: Optional["CacheConfig"] = None,
          error_config: Optional["ErrorConfig"] = None,
          async_config: Optional["AsyncConfig"] = None,
+         run_otel: Optional[bool] = False,
      ) -> Iterator[Golden]:
          from deepeval.evaluate.utils import (
              aggregate_metric_pass_rates,
@@ -1133,9 +1140,14 @@
          start_time = time.perf_counter()
          test_results: List[TestResult] = []

+         # sandwich start trace for OTEL
+         if run_otel:
+             ctx = self._start_otel_test_run()  # ignored span
+             ctx_token = attach(ctx)
+
          if async_config.run_async:
              loop = get_or_create_event_loop()
-             yield from a_execute_agentic_test_cases_from_loop(
+             for golden in a_execute_agentic_test_cases_from_loop(
                  goldens=goldens,
                  identifier=identifier,
                  loop=loop,
@@ -1145,9 +1157,19 @@
                  cache_config=cache_config,
                  error_config=error_config,
                  async_config=async_config,
-             )
+             ):
+                 if run_otel:
+                     _tracer = check_tracer()
+                     with _tracer.start_as_current_span(
+                         name=EVAL_DUMMY_SPAN_NAME,
+                         context=ctx,
+                     ):
+                         yield golden
+                 else:
+                     yield golden
+
          else:
-             yield from execute_agentic_test_cases_from_loop(
+             for golden in execute_agentic_test_cases_from_loop(
                  goldens=goldens,
                  trace_metrics=metrics,
                  display_config=display_config,
@@ -1155,7 +1177,16 @@
                  error_config=error_config,
                  test_results=test_results,
                  identifier=identifier,
-             )
+             ):
+                 if run_otel:
+                     _tracer = check_tracer()
+                     with _tracer.start_as_current_span(
+                         name=EVAL_DUMMY_SPAN_NAME,
+                         context=ctx,
+                     ):
+                         yield golden
+                 else:
+                     yield golden

          end_time = time.perf_counter()
          run_duration = end_time - start_time
@@ -1184,12 +1215,41 @@
          # clean up
          openai_test_case_pairs.clear()
          global_test_run_manager.save_test_run(TEMP_FILE_PATH)
-         confident_link = global_test_run_manager.wrap_up_test_run(
-             run_duration, display_table=False
-         )
-         return EvaluationResult(
-             test_results=test_results, confident_link=confident_link
-         )
+
+         # sandwich end trace for OTEL
+         if run_otel:
+             self._end_otel_test_run(ctx)
+             detach(ctx_token)
+
+         else:
+             confident_link = global_test_run_manager.wrap_up_test_run(
+                 run_duration, display_table=False
+             )
+             return EvaluationResult(
+                 test_results=test_results, confident_link=confident_link
+             )

      def evaluate(self, task: Task):
          global_evaluation_tasks.append(task)
+
+     def _start_otel_test_run(self, tracer: Optional[Tracer] = None) -> Context:
+         _tracer = check_tracer(tracer)
+         run_id = str(uuid.uuid4())
+         print("Starting OTLP test run with run_id: ", run_id)
+         ctx = baggage.set_baggage(
+             "confident.test_run.id", run_id, context=Context()
+         )
+         with _tracer.start_as_current_span(
+             "start_otel_test_run", context=ctx
+         ) as span:
+             span.set_attribute("confident.test_run.id", run_id)
+         return ctx
+
+     def _end_otel_test_run(self, ctx: Context, tracer: Optional[Tracer] = None):
+         run_id = baggage.get_baggage("confident.test_run.id", context=ctx)
+         print("Ending OTLP test run with run_id: ", run_id)
+         _tracer = check_tracer(tracer)
+         with _tracer.start_as_current_span(
+             "stop_otel_test_run", context=ctx
+         ) as span:
+             span.set_attribute("confident.test_run.id", run_id)
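
The run_otel changes lean on standard OpenTelemetry context and baggage propagation: the run id is written into a fresh Context, that context is attached for the duration of the run and detached afterwards, so every span started in between can read the baggage back. A stripped-down sketch of that mechanism using only the OpenTelemetry API (not deepeval code; the span and baggage names simply mirror the diff above, and the tracer name is a placeholder):

from opentelemetry import baggage, trace
from opentelemetry.context import Context, attach, detach

tracer = trace.get_tracer("sandwich-demo")  # placeholder tracer name

ctx = baggage.set_baggage("confident.test_run.id", "run-123", context=Context())
token = attach(ctx)  # spans created below run under this context
try:
    with tracer.start_as_current_span("start_otel_test_run", context=ctx):
        pass
    # ... one dummy span per yielded golden would sit here ...
    with tracer.start_as_current_span("stop_otel_test_run", context=ctx):
        pass
finally:
    detach(token)  # restore the previous context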
deepeval/dataset/test_run_tracer.py ADDED
@@ -0,0 +1,82 @@
+ import os
+ from typing import Optional
+ from opentelemetry import baggage
+ from opentelemetry.trace import Tracer as OTelTracer
+ from opentelemetry.sdk.trace import SpanProcessor
+ from opentelemetry.sdk.trace import TracerProvider
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor
+
+ try:
+     from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
+         OTLPSpanExporter,
+     )
+
+     is_opentelemetry_installed = True
+ except Exception:
+     is_opentelemetry_installed = False
+
+
+ def is_opentelemetry_available():
+     if not is_opentelemetry_installed:
+         raise ImportError(
+             "OpenTelemetry SDK is not available. Please install it with `pip install opentelemetry-exporter-otlp-proto-http`."
+         )
+     return True
+
+
+ from deepeval.confident.api import get_confident_api_key
+
+ OTLP_ENDPOINT = (
+     os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
+     if os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
+     else "https://otel.confident-ai.com"
+ )
+ # OTLP_ENDPOINT = "http://127.0.0.1:4318"
+
+ # Module-level globals to be imported and used by other code
+ GLOBAL_TEST_RUN_TRACER_PROVIDER: Optional[TracerProvider] = None
+ GLOBAL_TEST_RUN_TRACER: Optional[OTelTracer] = None
+
+
+ class RunIdSpanProcessor(SpanProcessor):
+     def on_start(self, span, parent_context):
+         run_id = baggage.get_baggage(
+             "confident.test_run.id", context=parent_context
+         )
+         if run_id:
+             span.set_attribute("confident.test_run.id", run_id)
+
+     def on_end(self, span) -> None:  # type: ignore[override]
+         # No-op
+         return None
+
+     def shutdown(self) -> None:  # type: ignore[override]
+         # No-op
+         return None
+
+     def force_flush(self, timeout_millis: int = 30000) -> bool:  # type: ignore[override]
+         # No-op
+         return True
+
+
+ def init_global_test_run_tracer(api_key: Optional[str] = None):
+     is_opentelemetry_available()
+     api_key = get_confident_api_key()
+     if api_key is None:
+         raise ValueError("CONFIDENT_API_KEY is not set")
+
+     provider = TracerProvider()
+     exporter = OTLPSpanExporter(
+         endpoint=f"{OTLP_ENDPOINT}/v1/traces",
+         headers={"x-confident-api-key": api_key},
+     )
+     provider.add_span_processor(RunIdSpanProcessor())
+     provider.add_span_processor(BatchSpanProcessor(span_exporter=exporter))
+     tracer = provider.get_tracer("deepeval_tracer")
+
+     global GLOBAL_TEST_RUN_TRACER_PROVIDER
+     global GLOBAL_TEST_RUN_TRACER
+     GLOBAL_TEST_RUN_TRACER_PROVIDER = provider
+     GLOBAL_TEST_RUN_TRACER = tracer
+
+     return provider, tracer
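
Combined with the dataset changes above, the intended flow appears to be: initialize the global test-run tracer once, then iterate goldens with run_otel=True so each golden is wrapped in a dummy span stamped with confident.test_run.id. A hedged sketch; the iterator method name (evals_iterator) and the dataset-loading call are assumptions, since neither is shown in this diff:

from deepeval.dataset import EvaluationDataset, init_global_test_run_tracer

# Requires CONFIDENT_API_KEY; spans go to OTEL_EXPORTER_OTLP_ENDPOINT or the default endpoint.
provider, tracer = init_global_test_run_tracer()

dataset = EvaluationDataset()
dataset.pull(alias="my-dataset")  # assumed: any way of populating goldens works

for golden in dataset.evals_iterator(run_otel=True):  # method name assumed
    ...  # invoke your app on golden.input; spans emitted here carry the run id

provider.force_flush()  # make sure the BatchSpanProcessor exports before exit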
deepeval/dataset/utils.py CHANGED
@@ -2,6 +2,10 @@ from typing import List, Optional, Any
  import json
  import re

+ from opentelemetry.trace import Tracer
+ from opentelemetry import trace
+ from opentelemetry.trace import NoOpTracerProvider
+
  from deepeval.dataset.api import Golden
  from deepeval.dataset.golden import ConversationalGolden
  from deepeval.test_case import LLMTestCase, ConversationalTestCase, Turn
@@ -151,3 +155,22 @@ def parse_turns(turns_str: str) -> List[Turn]:
              )
          )
      return res
+
+
+ def check_tracer(tracer: Optional[Tracer] = None) -> Tracer:
+     if tracer:
+         return tracer
+     # Prefer module-level test-run tracer if available
+     try:
+         from deepeval.dataset.test_run_tracer import (
+             GLOBAL_TEST_RUN_TRACER,
+         )
+
+         if GLOBAL_TEST_RUN_TRACER is not None:
+             return GLOBAL_TEST_RUN_TRACER
+     except Exception:
+         raise RuntimeError(
+             "No global OpenTelemetry tracer provider is configured."  # TODO: link to docs
+         )
+
+     return GLOBAL_TEST_RUN_TRACER
deepeval/key_handler.py CHANGED
@@ -1,12 +1,42 @@
  """File for handling API key"""

+ import os
  import json
+ import logging
+
  from enum import Enum
  from typing import Union

  from .constants import KEY_FILE, HIDDEN_DIR


+ logger = logging.getLogger(__name__)
+
+
+ SECRET_KEYS = {
+     # General providers
+     "OPENAI_API_KEY",
+     "ANTHROPIC_API_KEY",
+     # Azure OpenAI
+     "AZURE_OPENAI_API_KEY",
+     # Google / Gemini
+     "GOOGLE_API_KEY",
+     # xAI Grok
+     "GROK_API_KEY",
+     # Moonshot
+     "MOONSHOT_API_KEY",
+     # DeepSeek
+     "DEEPSEEK_API_KEY",
+     # LiteLLM
+     "LITELLM_API_KEY",
+     # Local gateways (if any require keys)
+     "LOCAL_MODEL_API_KEY",
+     "LOCAL_EMBEDDING_API_KEY",
+ }
+
+ _WARNED_SECRET_KEYS = set()
+
+
  class KeyValues(Enum):
      # Confident AI
      API_KEY = "api_key"
@@ -50,6 +80,7 @@ class ModelKeyValues(Enum):
      OPENAI_MODEL_NAME = "OPENAI_MODEL_NAME"
      OPENAI_COST_PER_INPUT_TOKEN = "OPENAI_COST_PER_INPUT_TOKEN"
      OPENAI_COST_PER_OUTPUT_TOKEN = "OPENAI_COST_PER_OUTPUT_TOKEN"
+     OPENAI_API_KEY = "OPENAI_API_KEY"
      # Moonshot
      USE_MOONSHOT_MODEL = "USE_MOONSHOT_MODEL"
      MOONSHOT_MODEL_NAME = "MOONSHOT_MODEL_NAME"
@@ -79,10 +110,21 @@ class KeyFileHandler:
      def __init__(self):
          self.data = {}

+     def _ensure_dir(self):
+         os.makedirs(HIDDEN_DIR, exist_ok=True)
+
      def write_key(
          self, key: Union[KeyValues, ModelKeyValues, EmbeddingKeyValues], value
      ):
          """Appends or updates data in the hidden file"""
+
+         # hard stop on secrets: never write to disk
+         if key.value in SECRET_KEYS:
+             logger.warning(
+                 f"{key} is blacklisted, refusing to persist. Keep your secrets in .env or .env.local instead"
+             )
+             return
+
          try:
              with open(f"{HIDDEN_DIR}/{KEY_FILE}", "r") as f:
                  # Load existing data
@@ -99,13 +141,15 @@
          self.data[key.value] = value

          # Write the updated data back to the file
+         self._ensure_dir()
          with open(f"{HIDDEN_DIR}/{KEY_FILE}", "w") as f:
              json.dump(self.data, f)

      def fetch_data(
          self, key: Union[KeyValues, ModelKeyValues, EmbeddingKeyValues]
      ):
-         """Fetches the data from the hidden file"""
+         """Fetches the data from the hidden file.
+         NOTE: secrets in this file are deprecated; prefer env/.env."""
          try:
              with open(f"{HIDDEN_DIR}/{KEY_FILE}", "r") as f:
                  try:
@@ -116,7 +160,24 @@
          except FileNotFoundError:
              # Handle the case when the file doesn't exist
              self.data = {}
-         return self.data.get(key.value)
+
+         value = self.data.get(key.value)
+
+         # Deprecation: warn only if we're actually returning a secret
+         if (
+             value is not None
+             and key.value in SECRET_KEYS
+             and key.value not in _WARNED_SECRET_KEYS
+         ):
+             logger.warning(
+                 f"Reading secret '{key.value}' from legacy {HIDDEN_DIR}/{KEY_FILE}. "
+                 "Persisting API keys in plaintext is deprecated. "
+                 "Move this to your environment (.env / .env.local). "
+                 "This fallback will be removed in a future release."
+             )
+             _WARNED_SECRET_KEYS.add(key.value)
+
+         return value

      def remove_key(
          self, key: Union[KeyValues, ModelKeyValues, EmbeddingKeyValues]
@@ -130,6 +191,7 @@
                      # Handle corrupted JSON file
                      self.data = {}
              self.data.pop(key.value, None)  # Remove the key if it exists
+             self._ensure_dir()
              with open(f"{HIDDEN_DIR}/{KEY_FILE}", "w") as f:
                  json.dump(self.data, f)
          except FileNotFoundError:
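
The practical effect of the blacklist: provider API keys are no longer persisted to the hidden .deepeval key file, non-secret settings behave as before, and reading a secret that already sits in the legacy file logs a one-time deprecation warning. A small illustrative sketch (key names taken from SECRET_KEYS and ModelKeyValues above; the key value is a placeholder):

from deepeval.key_handler import KeyFileHandler, ModelKeyValues

handler = KeyFileHandler()

# Refused: OPENAI_API_KEY is in SECRET_KEYS, so nothing is written, only a warning is logged.
handler.write_key(ModelKeyValues.OPENAI_API_KEY, "sk-placeholder")

# Non-secret settings are still stored in the hidden JSON file.
handler.write_key(ModelKeyValues.OPENAI_MODEL_NAME, "gpt-4o-mini")

# A secret already present in the legacy file is still returned,
# but with a deprecation warning emitted once per key.
value = handler.fetch_data(ModelKeyValues.OPENAI_API_KEY)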
deepeval/metrics/__init__.py CHANGED
@@ -5,7 +5,8 @@ from .base_metric import (
      BaseArenaMetric,
  )

- from .dag.dag import DAGMetric
+ from .dag.dag import DAGMetric, DeepAcyclicGraph
+ from .conversational_dag.conversational_dag import ConversationalDAGMetric
  from .bias.bias import BiasMetric
  from .toxicity.toxicity import ToxicityMetric
  from .pii_leakage.pii_leakage import PIILeakageMetric
@@ -67,6 +68,8 @@ __all__ = [
      "ArenaGEval",
      "ConversationalGEval",
      "DAGMetric",
+     "DeepAcyclicGraph",
+     "ConversationalDAGMetric"
      # RAG metrics
      "AnswerRelevancyMetric",
      "FaithfulnessMetric",
deepeval/metrics/answer_relevancy/template.py CHANGED
@@ -37,7 +37,7 @@ JSON:
  Please generate a list of JSON with two keys: `verdict` and `reason`.
  The 'verdict' key should STRICTLY be either a 'yes', 'idk' or 'no'. Answer 'yes' if the statement is relevant to addressing the original input, 'no' if the statement is irrelevant, and 'idk' if it is ambiguous (eg., not directly relevant but could be used as a supporting point to address the input).
  The 'reason' is the reason for the verdict.
- Provide a 'reason' ONLY if the answer is 'no'.
+ Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
  The provided statements are statements made in the actual output.

  **
@@ -53,7 +53,8 @@ Example statements:
  "Security features include fingerprint authentication and an encrypted SSD.",
  "Every purchase comes with a one-year warranty.",
  "24/7 customer support is included.",
- "Pineapples taste great on pizza."
+ "Pineapples taste great on pizza.",
+ "The laptop is a Dell XPS 13."
  ]

  Example JSON:
@@ -79,6 +80,10 @@ Example JSON:
  {{
  "verdict": "no",
  "reason": "The statement about pineapples on pizza is completely irrelevant to the input, which asks about laptop features."
+ }},
+ {{
+ "verdict": "idk",
+ "reason": "The statement about the laptop being a Dell XPS 13 is not directly relevant to the input, but could be used as a supporting point to address the input."
  }}
  ]
  }}
deepeval/metrics/conversational_dag/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from .nodes import (
+     ConversationalBaseNode,
+     ConversationalVerdictNode,
+     ConversationalTaskNode,
+     ConversationalBinaryJudgementNode,
+     ConversationalNonBinaryJudgementNode,
+ )
deepeval/metrics/conversational_dag/conversational_dag.py ADDED
@@ -0,0 +1,139 @@
+ from typing import Optional, Union
+ from deepeval.metrics import BaseConversationalMetric
+ from deepeval.test_case import (
+     ConversationalTestCase,
+ )
+ from deepeval.utils import get_or_create_event_loop
+ from deepeval.metrics.utils import (
+     check_conversational_test_case_params,
+     construct_verbose_logs,
+     initialize_model,
+ )
+ from deepeval.models import DeepEvalBaseLLM
+ from deepeval.metrics.indicator import metric_progress_indicator
+ from deepeval.metrics.g_eval.schema import *
+ from deepeval.metrics import DeepAcyclicGraph
+ from deepeval.metrics.dag.utils import (
+     is_valid_dag_from_roots,
+     extract_required_params,
+     copy_graph,
+ )
+
+
+ class ConversationalDAGMetric(BaseConversationalMetric):
+
+     def __init__(
+         self,
+         name: str,
+         dag: DeepAcyclicGraph,
+         model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+         threshold: float = 0.5,
+         include_reason: bool = True,
+         async_mode: bool = True,
+         strict_mode: bool = False,
+         verbose_mode: bool = False,
+         _include_dag_suffix: bool = True,
+     ):
+         if (
+             is_valid_dag_from_roots(
+                 root_nodes=dag.root_nodes, multiturn=dag.multiturn
+             )
+             == False
+         ):
+             raise ValueError("Cycle detected in DAG graph.")
+
+         self._verbose_steps: List[str] = []
+         self.dag = copy_graph(dag)
+         self.name = name
+         self.model, self.using_native_model = initialize_model(model)
+         self.evaluation_model = self.model.get_model_name()
+         self.threshold = 1 if strict_mode else threshold
+         self.include_reason = include_reason
+         self.strict_mode = strict_mode
+         self.async_mode = async_mode
+         self.verbose_mode = verbose_mode
+         self._include_dag_suffix = _include_dag_suffix
+
+     def measure(
+         self,
+         test_case: ConversationalTestCase,
+         _show_indicator: bool = True,
+         _in_component: bool = False,
+     ) -> float:
+         check_conversational_test_case_params(
+             test_case,
+             extract_required_params(self.dag.root_nodes, multiturn=True),
+             self,
+         )
+
+         self.evaluation_cost = 0 if self.using_native_model else None
+         with metric_progress_indicator(
+             self, _show_indicator=_show_indicator, _in_component=_in_component
+         ):
+             if self.async_mode:
+                 loop = get_or_create_event_loop()
+                 loop.run_until_complete(
+                     self.a_measure(
+                         test_case,
+                         _show_indicator=False,
+                         _in_component=_in_component,
+                     )
+                 )
+             else:
+                 self.dag._execute(metric=self, test_case=test_case)
+                 self.success = self.is_successful()
+                 self.verbose_logs = construct_verbose_logs(
+                     self,
+                     steps=[
+                         *self._verbose_steps,
+                         f"Score: {self.score}\nReason: {self.reason}",
+                     ],
+                 )
+             return self.score
+
+     async def a_measure(
+         self,
+         test_case: ConversationalTestCase,
+         _show_indicator: bool = True,
+         _in_component: bool = False,
+     ) -> float:
+         check_conversational_test_case_params(
+             test_case,
+             extract_required_params(self.dag.root_nodes, multiturn=True),
+             self,
+         )
+
+         self.evaluation_cost = 0 if self.using_native_model else None
+         with metric_progress_indicator(
+             self,
+             async_mode=True,
+             _show_indicator=_show_indicator,
+             _in_component=_in_component,
+         ):
+             await self.dag._a_execute(metric=self, test_case=test_case)
+             self.success = self.is_successful()
+             self.verbose_logs = construct_verbose_logs(
+                 self,
+                 steps=[
+                     *self._verbose_steps,
+                     f"Score: {self.score}\nReason: {self.reason}",
+                 ],
+             )
+             return self.score
+
+     def is_successful(self) -> bool:
+         if self.error is not None:
+             self.success = False
+         else:
+             try:
+                 self.success = self.score >= self.threshold
+             except:
+                 self.success = False
+         return self.success
+
+     @property
+     def __name__(self):
+         if self._include_dag_suffix:
+             return f"{self.name} [ConversationalDAG]"
+         else:
+             return self.name
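
Usage of the new ConversationalDAGMetric mirrors the single-turn DAGMetric: build a DeepAcyclicGraph from the conversational node types and pass it in via dag. A hedged sketch; the constructor arguments below (root_nodes, criteria/children, verdict/score) are assumptions modeled on the non-conversational DAG API, since neither DeepAcyclicGraph's multiturn handling nor nodes.py is shown in this diff:

from deepeval.metrics import ConversationalDAGMetric, DeepAcyclicGraph
from deepeval.metrics.conversational_dag import (
    ConversationalBinaryJudgementNode,
    ConversationalVerdictNode,
)
from deepeval.test_case import ConversationalTestCase, Turn

# Assumed constructor shapes, mirroring BinaryJudgementNode / VerdictNode.
root = ConversationalBinaryJudgementNode(
    criteria="Does the assistant stay polite across all turns?",
    children=[
        ConversationalVerdictNode(verdict=False, score=0),
        ConversationalVerdictNode(verdict=True, score=10),
    ],
)

metric = ConversationalDAGMetric(
    name="Politeness",
    dag=DeepAcyclicGraph(root_nodes=[root]),
    threshold=0.5,
)

test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="Where is my order?"),
        Turn(role="assistant", content="Happy to help! It ships tomorrow."),
    ]
)
metric.measure(test_case)
print(metric.score, metric.reason)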