deepeval 3.4.7__py3-none-any.whl → 3.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +0 -2
- deepeval/_version.py +1 -1
- deepeval/cli/dotenv_handler.py +71 -0
- deepeval/cli/main.py +1039 -132
- deepeval/cli/utils.py +116 -2
- deepeval/key_handler.py +63 -2
- deepeval/metrics/__init__.py +4 -1
- deepeval/metrics/conversational_dag/__init__.py +7 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +139 -0
- deepeval/metrics/conversational_dag/nodes.py +931 -0
- deepeval/metrics/conversational_dag/templates.py +117 -0
- deepeval/metrics/dag/dag.py +13 -4
- deepeval/metrics/dag/graph.py +47 -15
- deepeval/metrics/dag/utils.py +103 -38
- deepeval/synthesizer/chunking/doc_chunker.py +87 -51
- {deepeval-3.4.7.dist-info → deepeval-3.4.8.dist-info}/METADATA +1 -1
- {deepeval-3.4.7.dist-info → deepeval-3.4.8.dist-info}/RECORD +20 -15
- {deepeval-3.4.7.dist-info → deepeval-3.4.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.4.7.dist-info → deepeval-3.4.8.dist-info}/WHEEL +0 -0
- {deepeval-3.4.7.dist-info → deepeval-3.4.8.dist-info}/entry_points.txt +0 -0
deepeval/cli/utils.py
CHANGED
|
@@ -1,7 +1,13 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
2
4
|
import webbrowser
|
|
3
5
|
import pyfiglet
|
|
4
|
-
|
|
6
|
+
|
|
7
|
+
from enum import Enum
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from rich import print
|
|
10
|
+
from typing import Optional, Dict, Iterable, List, Tuple, Union
|
|
5
11
|
from opentelemetry.trace import Span
|
|
6
12
|
|
|
7
13
|
from deepeval.key_handler import (
|
|
@@ -14,8 +20,25 @@ from deepeval.test_run.test_run import (
|
|
|
14
20
|
global_test_run_manager,
|
|
15
21
|
)
|
|
16
22
|
from deepeval.confident.api import get_confident_api_key, set_confident_api_key
|
|
23
|
+
from deepeval.cli.dotenv_handler import DotenvHandler
|
|
17
24
|
|
|
25
|
+
|
|
26
|
+
StrOrEnum = Union[str, "Enum"]
|
|
18
27
|
PROD = "https://app.confident-ai.com"
|
|
28
|
+
# List all mutually exclusive USE_* keys
|
|
29
|
+
USE_MODEL_KEYS: List[ModelKeyValues | EmbeddingKeyValues] = [
|
|
30
|
+
ModelKeyValues.USE_OPENAI_MODEL,
|
|
31
|
+
ModelKeyValues.USE_AZURE_OPENAI,
|
|
32
|
+
ModelKeyValues.USE_LOCAL_MODEL,
|
|
33
|
+
ModelKeyValues.USE_GROK_MODEL,
|
|
34
|
+
ModelKeyValues.USE_MOONSHOT_MODEL,
|
|
35
|
+
ModelKeyValues.USE_DEEPSEEK_MODEL,
|
|
36
|
+
ModelKeyValues.USE_GEMINI_MODEL,
|
|
37
|
+
ModelKeyValues.USE_LITELLM,
|
|
38
|
+
EmbeddingKeyValues.USE_AZURE_OPENAI_EMBEDDING,
|
|
39
|
+
EmbeddingKeyValues.USE_LOCAL_EMBEDDINGS,
|
|
40
|
+
# MAINTENANCE: add more if new USE_* keys appear
|
|
41
|
+
]
|
|
19
42
|
|
|
20
43
|
|
|
21
44
|
def render_login_message():
|
|
@@ -65,3 +88,94 @@ def clear_evaluation_model_keys():
|
|
|
65
88
|
def clear_embedding_model_keys():
|
|
66
89
|
for key in EmbeddingKeyValues:
|
|
67
90
|
KEY_FILE_HANDLER.remove_key(key)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _to_str_key(k: StrOrEnum) -> str:
|
|
94
|
+
return k.value if hasattr(k, "value") else str(k)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _normalize_kv(updates: Dict[StrOrEnum, str]) -> Dict[str, str]:
    """Return a copy of ``updates`` whose keys have been passed through ``_to_str_key``."""
    normalized: Dict[str, str] = {}
    for raw_key, val in updates.items():
        normalized[_to_str_key(raw_key)] = val
    return normalized
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _normalize_keys(keys: Iterable[StrOrEnum]) -> list[str]:
    """Return a list with every key in ``keys`` passed through ``_to_str_key``, in order."""
    return list(map(_to_str_key, keys))
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _parse_save_option(
|
|
106
|
+
save_opt: str | None, default_path: str = ".env.local"
|
|
107
|
+
) -> Tuple[bool, str | None]:
|
|
108
|
+
if not save_opt:
|
|
109
|
+
return False, None
|
|
110
|
+
kind, *rest = save_opt.split(":", 1)
|
|
111
|
+
if kind != "dotenv":
|
|
112
|
+
return False, None
|
|
113
|
+
path = rest[0] if rest else default_path
|
|
114
|
+
return True, path
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def resolve_save_target(save_opt: Optional[str]) -> Optional[str]:
    """
    Pick the save target string to use (e.g. 'dotenv:.env.local'), or None.

    Resolution order:
      1) a non-empty ``save_opt`` (the --save value), returned unchanged
      2) the DEEPEVAL_DEFAULT_SAVE environment variable, whitespace-stripped
      3) None when neither yields a value (no save)
    """
    if not save_opt:
        # An unset or whitespace-only variable counts as "no default".
        fallback = (os.getenv("DEEPEVAL_DEFAULT_SAVE") or "").strip()
        return fallback or None
    return save_opt
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def save_environ_to_store(
    save_opt: str | None, updates: Dict[StrOrEnum, str]
) -> Tuple[bool, str | None]:
    """
    Write 'updates' to the store named by 'save_opt' (only dotenv is handled here).

    The option is parsed with _parse_save_option; when it selects dotenv and
    'updates' is non-empty, the normalized key/value pairs are passed to
    DotenvHandler(path).upsert().

    Returns (handled, path); (False, None) when the option is not a dotenv target.
    """
    handled, target_path = _parse_save_option(save_opt)
    if handled:
        if updates:
            DotenvHandler(target_path).upsert(_normalize_kv(updates))
        return True, target_path
    return False, None
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def unset_environ_in_store(
    save_opt: str | None, keys: Iterable[StrOrEnum]
) -> Tuple[bool, str | None]:
    """
    Remove 'keys' from the store named by 'save_opt' (only dotenv is handled here).

    The option is parsed with _parse_save_option; when it selects dotenv and
    at least one key was given, the normalized key names are passed to
    DotenvHandler(path).unset().

    Returns (handled, path); (False, None) when the option is not a dotenv target.
    """
    handled, target_path = _parse_save_option(save_opt)
    if handled:
        key_names = _normalize_keys(keys)
        if key_names:
            DotenvHandler(target_path).unset(key_names)
        return True, target_path
    return False, None
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def switch_model_provider(target: ModelKeyValues, save: Optional[str] = None) -> None:
    """
    Set the USE_* flag for ``target`` to "YES" and every other flag listed in
    USE_MODEL_KEYS to "NO".

    Every flag is written through KEY_FILE_HANDLER. When ``save`` is provided,
    the same complete set of flags is also handed to save_environ_to_store in
    a single call, so the dotenv file receives all flags together.

    Args:
        target: The USE_* key to enable; must be a member of USE_MODEL_KEYS.
        save: Optional save option string (e.g. "dotenv" or "dotenv:path").

    Raises:
        ValueError: If ``target`` is not in USE_MODEL_KEYS.
    """
    if target not in USE_MODEL_KEYS:
        raise ValueError(f"{target} is not a recognized USE_* model key")

    flags = {key: "YES" if key == target else "NO" for key in USE_MODEL_KEYS}

    for key, value in flags.items():
        KEY_FILE_HANDLER.write_key(key, value)

    if not save:
        return

    # One batched write: every flag reaches the store, and an unsupported
    # option is reported once rather than once per key.
    handled, _ = save_environ_to_store(save, flags)
    if not handled:
        print("Unsupported --save option. Use --save=dotenv[:path].")
|
deepeval/key_handler.py
CHANGED
|
@@ -1,12 +1,42 @@
|
|
|
1
1
|
"""File for handling API key"""
|
|
2
2
|
|
|
3
|
+
import os
|
|
3
4
|
import json
|
|
5
|
+
import logging
|
|
6
|
+
|
|
4
7
|
from enum import Enum
|
|
5
8
|
from typing import Union
|
|
6
9
|
|
|
7
10
|
from .constants import KEY_FILE, HIDDEN_DIR
|
|
8
11
|
|
|
9
12
|
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
SECRET_KEYS = {
|
|
17
|
+
# General providers
|
|
18
|
+
"OPENAI_API_KEY",
|
|
19
|
+
"ANTHROPIC_API_KEY",
|
|
20
|
+
# Azure OpenAI
|
|
21
|
+
"AZURE_OPENAI_API_KEY",
|
|
22
|
+
# Google / Gemini
|
|
23
|
+
"GOOGLE_API_KEY",
|
|
24
|
+
# xAI Grok
|
|
25
|
+
"GROK_API_KEY",
|
|
26
|
+
# Moonshot
|
|
27
|
+
"MOONSHOT_API_KEY",
|
|
28
|
+
# DeepSeek
|
|
29
|
+
"DEEPSEEK_API_KEY",
|
|
30
|
+
# LiteLLM
|
|
31
|
+
"LITELLM_API_KEY",
|
|
32
|
+
# Local gateways (if any require keys)
|
|
33
|
+
"LOCAL_MODEL_API_KEY",
|
|
34
|
+
"LOCAL_EMBEDDING_API_KEY",
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
_WARNED_SECRET_KEYS = set()
|
|
38
|
+
|
|
39
|
+
|
|
10
40
|
class KeyValues(Enum):
|
|
11
41
|
# Confident AI
|
|
12
42
|
API_KEY = "api_key"
|
|
@@ -79,10 +109,21 @@ class KeyFileHandler:
|
|
|
79
109
|
def __init__(self):
|
|
80
110
|
self.data = {}
|
|
81
111
|
|
|
112
|
+
def _ensure_dir(self):
    """Create the HIDDEN_DIR directory if it is missing; an existing directory is not an error."""
    os.makedirs(name=HIDDEN_DIR, exist_ok=True)
|
|
114
|
+
|
|
82
115
|
def write_key(
|
|
83
116
|
self, key: Union[KeyValues, ModelKeyValues, EmbeddingKeyValues], value
|
|
84
117
|
):
|
|
85
118
|
"""Appends or updates data in the hidden file"""
|
|
119
|
+
|
|
120
|
+
# hard stop on secrets: never write to disk
|
|
121
|
+
if key.value in SECRET_KEYS:
|
|
122
|
+
logger.warning(
|
|
123
|
+
f"{key} is blacklisted, refusing to persist. Keep your secrets in .env or .env.local instead"
|
|
124
|
+
)
|
|
125
|
+
return
|
|
126
|
+
|
|
86
127
|
try:
|
|
87
128
|
with open(f"{HIDDEN_DIR}/{KEY_FILE}", "r") as f:
|
|
88
129
|
# Load existing data
|
|
@@ -99,13 +140,15 @@ class KeyFileHandler:
|
|
|
99
140
|
self.data[key.value] = value
|
|
100
141
|
|
|
101
142
|
# Write the updated data back to the file
|
|
143
|
+
self._ensure_dir()
|
|
102
144
|
with open(f"{HIDDEN_DIR}/{KEY_FILE}", "w") as f:
|
|
103
145
|
json.dump(self.data, f)
|
|
104
146
|
|
|
105
147
|
def fetch_data(
|
|
106
148
|
self, key: Union[KeyValues, ModelKeyValues, EmbeddingKeyValues]
|
|
107
149
|
):
|
|
108
|
-
"""Fetches the data from the hidden file
|
|
150
|
+
"""Fetches the data from the hidden file.
|
|
151
|
+
NOTE: secrets in this file are deprecated; prefer env/.env."""
|
|
109
152
|
try:
|
|
110
153
|
with open(f"{HIDDEN_DIR}/{KEY_FILE}", "r") as f:
|
|
111
154
|
try:
|
|
@@ -116,7 +159,24 @@ class KeyFileHandler:
|
|
|
116
159
|
except FileNotFoundError:
|
|
117
160
|
# Handle the case when the file doesn't exist
|
|
118
161
|
self.data = {}
|
|
119
|
-
|
|
162
|
+
|
|
163
|
+
value = self.data.get(key.value)
|
|
164
|
+
|
|
165
|
+
# Deprecation: warn only if we're actually returning a secret
|
|
166
|
+
if (
|
|
167
|
+
value is not None
|
|
168
|
+
and key.value in SECRET_KEYS
|
|
169
|
+
and key.value not in _WARNED_SECRET_KEYS
|
|
170
|
+
):
|
|
171
|
+
logger.warning(
|
|
172
|
+
f"Reading secret '{key.value}' from legacy {HIDDEN_DIR}/{KEY_FILE}. "
|
|
173
|
+
"Persisting API keys in plaintext is deprecated. "
|
|
174
|
+
"Move this to your environment (.env / .env.local). "
|
|
175
|
+
"This fallback will be removed in a future release."
|
|
176
|
+
)
|
|
177
|
+
_WARNED_SECRET_KEYS.add(key.value)
|
|
178
|
+
|
|
179
|
+
return value
|
|
120
180
|
|
|
121
181
|
def remove_key(
|
|
122
182
|
self, key: Union[KeyValues, ModelKeyValues, EmbeddingKeyValues]
|
|
@@ -130,6 +190,7 @@ class KeyFileHandler:
|
|
|
130
190
|
# Handle corrupted JSON file
|
|
131
191
|
self.data = {}
|
|
132
192
|
self.data.pop(key.value, None) # Remove the key if it exists
|
|
193
|
+
self._ensure_dir()
|
|
133
194
|
with open(f"{HIDDEN_DIR}/{KEY_FILE}", "w") as f:
|
|
134
195
|
json.dump(self.data, f)
|
|
135
196
|
except FileNotFoundError:
|
deepeval/metrics/__init__.py
CHANGED
|
@@ -5,7 +5,8 @@ from .base_metric import (
|
|
|
5
5
|
BaseArenaMetric,
|
|
6
6
|
)
|
|
7
7
|
|
|
8
|
-
from .dag.dag import DAGMetric
|
|
8
|
+
from .dag.dag import DAGMetric, DeepAcyclicGraph
|
|
9
|
+
from .conversational_dag.conversational_dag import ConversationalDAGMetric
|
|
9
10
|
from .bias.bias import BiasMetric
|
|
10
11
|
from .toxicity.toxicity import ToxicityMetric
|
|
11
12
|
from .pii_leakage.pii_leakage import PIILeakageMetric
|
|
@@ -67,6 +68,8 @@ __all__ = [
|
|
|
67
68
|
"ArenaGEval",
|
|
68
69
|
"ConversationalGEval",
|
|
69
70
|
"DAGMetric",
|
|
71
|
+
"DeepAcyclicGraph",
|
|
72
|
+
"ConversationalDAGMetric",
|
|
70
73
|
# RAG metrics
|
|
71
74
|
"AnswerRelevancyMetric",
|
|
72
75
|
"FaithfulnessMetric",
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
from typing import Optional, Union
|
|
2
|
+
from deepeval.metrics import BaseConversationalMetric
|
|
3
|
+
from deepeval.test_case import (
|
|
4
|
+
ConversationalTestCase,
|
|
5
|
+
)
|
|
6
|
+
from deepeval.utils import get_or_create_event_loop
|
|
7
|
+
from deepeval.metrics.utils import (
|
|
8
|
+
check_conversational_test_case_params,
|
|
9
|
+
construct_verbose_logs,
|
|
10
|
+
initialize_model,
|
|
11
|
+
)
|
|
12
|
+
from deepeval.models import DeepEvalBaseLLM
|
|
13
|
+
from deepeval.metrics.indicator import metric_progress_indicator
|
|
14
|
+
from deepeval.metrics.g_eval.schema import *
|
|
15
|
+
from deepeval.metrics import DeepAcyclicGraph
|
|
16
|
+
from deepeval.metrics.dag.utils import (
|
|
17
|
+
is_valid_dag_from_roots,
|
|
18
|
+
extract_required_params,
|
|
19
|
+
copy_graph,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ConversationalDAGMetric(BaseConversationalMetric):
    """Conversational metric that is evaluated by executing a DAG.

    The graph passed in is validated and copied at construction time; each
    measurement runs that copy against a ``ConversationalTestCase`` via the
    graph's ``_execute`` / ``_a_execute`` methods.
    """

    def __init__(
        self,
        name: str,
        dag: DeepAcyclicGraph,
        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
        threshold: float = 0.5,
        include_reason: bool = True,
        async_mode: bool = True,
        strict_mode: bool = False,
        verbose_mode: bool = False,
        _include_dag_suffix: bool = True,
    ):
        """Validate ``dag`` and store the metric configuration.

        Args:
            name: Display name of the metric.
            dag: Graph to execute; validated, then copied with ``copy_graph``.
            model: Model (or model name) passed to ``initialize_model``.
            threshold: Minimum passing score; replaced by 1 in strict mode.
            include_reason: Stored as ``self.include_reason``.
            async_mode: When true, ``measure`` delegates to ``a_measure``.
            strict_mode: Forces the threshold to 1.
            verbose_mode: Stored as ``self.verbose_mode``.
            _include_dag_suffix: Whether ``__name__`` appends
                " [ConversationalDAG]" to ``name``.

        Raises:
            ValueError: If ``is_valid_dag_from_roots`` rejects the graph.
        """
        if not is_valid_dag_from_roots(
            root_nodes=dag.root_nodes, multiturn=dag.multiturn
        ):
            raise ValueError("Cycle detected in DAG graph.")

        self._verbose_steps: list[str] = []
        self.dag = copy_graph(dag)
        self.name = name
        self.model, self.using_native_model = initialize_model(model)
        self.evaluation_model = self.model.get_model_name()
        self.threshold = 1 if strict_mode else threshold
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose_mode = verbose_mode
        self._include_dag_suffix = _include_dag_suffix

    def _record_dag_outcome(self) -> float:
        """Store success and verbose logs after the DAG has run; return the score."""
        self.success = self.is_successful()
        self.verbose_logs = construct_verbose_logs(
            self,
            steps=[
                *self._verbose_steps,
                f"Score: {self.score}\nReason: {self.reason}",
            ],
        )
        return self.score

    def measure(
        self,
        test_case: ConversationalTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
    ) -> float:
        """Run the DAG synchronously against ``test_case`` and return the score.

        In async mode the work is delegated to ``a_measure`` on an event loop
        obtained from ``get_or_create_event_loop``.
        """
        check_conversational_test_case_params(
            test_case,
            extract_required_params(self.dag.root_nodes, multiturn=True),
            self,
        )

        self.evaluation_cost = 0 if self.using_native_model else None
        with metric_progress_indicator(
            self, _show_indicator=_show_indicator, _in_component=_in_component
        ):
            if self.async_mode:
                loop = get_or_create_event_loop()
                # a_measure records success and verbose logs itself, so
                # nothing further is needed on this path.
                loop.run_until_complete(
                    self.a_measure(
                        test_case,
                        _show_indicator=False,
                        _in_component=_in_component,
                    )
                )
            else:
                self.dag._execute(metric=self, test_case=test_case)
                self._record_dag_outcome()
            return self.score

    async def a_measure(
        self,
        test_case: ConversationalTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
    ) -> float:
        """Run the DAG asynchronously against ``test_case`` and return the score."""
        check_conversational_test_case_params(
            test_case,
            extract_required_params(self.dag.root_nodes, multiturn=True),
            self,
        )

        self.evaluation_cost = 0 if self.using_native_model else None
        with metric_progress_indicator(
            self,
            async_mode=True,
            _show_indicator=_show_indicator,
            _in_component=_in_component,
        ):
            await self.dag._a_execute(metric=self, test_case=test_case)
            return self._record_dag_outcome()

    def is_successful(self) -> bool:
        """Return (and store) whether the score meets the threshold.

        An existing error, or a score that cannot be compared with the
        threshold, counts as failure.
        """
        if self.error is not None:
            self.success = False
        else:
            try:
                self.success = self.score >= self.threshold
            except Exception:
                # e.g. the score is not comparable with the threshold; unlike a
                # bare ``except``, KeyboardInterrupt/SystemExit still propagate.
                self.success = False
        return self.success

    @property
    def __name__(self):
        """Metric name, with " [ConversationalDAG]" appended unless disabled."""
        if self._include_dag_suffix:
            return f"{self.name} [ConversationalDAG]"
        else:
            return self.name
|