deepeval 3.4.7__py3-none-any.whl → 3.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +8 -7
- deepeval/_version.py +1 -1
- deepeval/cli/dotenv_handler.py +71 -0
- deepeval/cli/main.py +1021 -280
- deepeval/cli/utils.py +116 -2
- deepeval/confident/api.py +29 -14
- deepeval/config/__init__.py +0 -0
- deepeval/config/settings.py +565 -0
- deepeval/config/settings_manager.py +133 -0
- deepeval/config/utils.py +86 -0
- deepeval/dataset/__init__.py +1 -0
- deepeval/dataset/dataset.py +70 -10
- deepeval/dataset/test_run_tracer.py +82 -0
- deepeval/dataset/utils.py +23 -0
- deepeval/key_handler.py +64 -2
- deepeval/metrics/__init__.py +4 -1
- deepeval/metrics/answer_relevancy/template.py +7 -2
- deepeval/metrics/conversational_dag/__init__.py +7 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +139 -0
- deepeval/metrics/conversational_dag/nodes.py +931 -0
- deepeval/metrics/conversational_dag/templates.py +117 -0
- deepeval/metrics/dag/dag.py +13 -4
- deepeval/metrics/dag/graph.py +47 -15
- deepeval/metrics/dag/utils.py +103 -38
- deepeval/metrics/faithfulness/template.py +11 -8
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +6 -4
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +6 -4
- deepeval/metrics/tool_correctness/tool_correctness.py +7 -3
- deepeval/models/llms/amazon_bedrock_model.py +24 -3
- deepeval/models/llms/openai_model.py +37 -41
- deepeval/models/retry_policy.py +280 -0
- deepeval/openai_agents/agent.py +4 -2
- deepeval/synthesizer/chunking/doc_chunker.py +87 -51
- deepeval/test_run/api.py +1 -0
- deepeval/tracing/otel/exporter.py +20 -8
- deepeval/tracing/otel/utils.py +57 -0
- deepeval/tracing/tracing.py +37 -16
- deepeval/tracing/utils.py +98 -1
- deepeval/utils.py +111 -70
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/METADATA +3 -1
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/RECORD +44 -34
- deepeval/env.py +0 -35
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/LICENSE.md +0 -0
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/WHEEL +0 -0
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/entry_points.txt +0 -0
deepeval/config/utils.py
ADDED
@@ -0,0 +1,86 @@
+import os
+from typing import Any, Optional
+
+_TRUTHY = frozenset({"1", "true", "t", "yes", "y", "on", "enable", "enabled"})
+_FALSY = frozenset({"0", "false", "f", "no", "n", "off", "disable", "disabled"})
+
+
+def parse_bool(value: Any, default: bool = False) -> bool:
+    """
+    Parse an arbitrary value into a boolean using env style semantics.
+
+    Truthy tokens (case-insensitive, quotes/whitespace ignored):
+        1, true, t, yes, y, on, enable, enabled
+    Falsy tokens:
+        0, false, f, no, n, off, disable, disabled
+
+    - bool -> returned as is
+    - None -> returns `default`
+    - int/float -> False if == 0, else True
+    - str/other -> matched against tokens above; non-matching -> `default`
+
+    Args:
+        value: Value to interpret.
+        default: Value to return if `value` is None or doesn't match any token.
+
+    Returns:
+        The interpreted boolean.
+    """
+    if isinstance(value, bool):
+        return value
+    if value is None:
+        return default
+    if isinstance(value, (int, float)):
+        return value != 0
+
+    s = str(value).strip().strip('"').strip("'").lower()
+    if not s:
+        return default
+    if s in _TRUTHY:
+        return True
+    if s in _FALSY:
+        return False
+    return default
+
+
+def get_env_bool(key: str, default: bool = False) -> bool:
+    """
+    Read an environment variable and parse it as a boolean using `parse_bool`.
+
+    Args:
+        key: Environment variable name.
+        default: Returned when the variable is unset or does not match any token.
+
+    Returns:
+        Parsed boolean value.
+    """
+    return parse_bool(os.getenv(key), default)
+
+
+def bool_to_env_str(value: bool) -> str:
+    """
+    Canonicalize a boolean to the env/dotenv string form: "1" or "0".
+
+    Args:
+        value: Boolean to serialize.
+
+    Returns:
+        "1" if True, "0" if False.
+    """
+    return "1" if bool(value) else "0"
+
+
+def set_env_bool(key: str, value: Optional[bool] = False) -> None:
+    """
+    Set an environment variable to a canonical boolean string ("1" or "0").
+
+    Args:
+        key: The environment variable name to set.
+        value: The boolean value to store. If None, it is treated as False.
+            True -> "1", False/None -> "0".
+
+    Notes:
+        - This function always overwrites the variable in `os.environ`.
+        - Use `get_env_bool` to read back and parse the value safely.
+    """
+    os.environ[key] = bool_to_env_str(bool(value))
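Usage sketch (illustrative, not part of the package diff): how the new helpers compose. The SOME_FEATURE_FLAG / UNSET_FLAG variable names are made up for the example.

import os

from deepeval.config.utils import parse_bool, get_env_bool, set_env_bool

# token parsing with a fallback default
assert parse_bool("YES") is True                     # truthy token, case-insensitive
assert parse_bool(" off ") is False                  # falsy token, whitespace ignored
assert parse_bool(None, default=True) is True
assert parse_bool("maybe", default=False) is False   # unknown token -> default

# round-trip through the environment using the canonical "1"/"0" form
set_env_bool("SOME_FEATURE_FLAG", True)
assert os.environ["SOME_FEATURE_FLAG"] == "1"
assert get_env_bool("SOME_FEATURE_FLAG") is True
assert get_env_bool("UNSET_FLAG", default=True) is True  # unset -> default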
deepeval/dataset/__init__.py
CHANGED
deepeval/dataset/dataset.py
CHANGED
@@ -1,6 +1,8 @@
 from asyncio import Task
 from typing import Iterator, List, Optional, Union, Literal
 from dataclasses import dataclass, field
+from opentelemetry.trace import Tracer
+from opentelemetry.context import Context, attach, detach
 from rich.console import Console
 from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
 import json
@@ -10,6 +12,8 @@ import os
 import datetime
 import time
 import ast
+import uuid
+from opentelemetry import baggage
 
 from deepeval.confident.api import Api, Endpoints, HttpMethods
 from deepeval.dataset.utils import (
@@ -18,6 +22,7 @@ from deepeval.dataset.utils import (
     convert_convo_goldens_to_convo_test_cases,
     convert_convo_test_cases_to_convo_goldens,
     format_turns,
+    check_tracer,
     parse_turns,
     trimAndLoadJson,
 )
@@ -47,6 +52,7 @@ from deepeval.test_run import (
 from deepeval.dataset.types import global_evaluation_tasks
 from deepeval.openai.utils import openai_test_case_pairs
 from deepeval.tracing import trace_manager
+from deepeval.tracing.tracing import EVAL_DUMMY_SPAN_NAME
 
 
 valid_file_types = ["csv", "json", "jsonl"]
@@ -1097,6 +1103,7 @@ class EvaluationDataset:
         cache_config: Optional["CacheConfig"] = None,
         error_config: Optional["ErrorConfig"] = None,
         async_config: Optional["AsyncConfig"] = None,
+        run_otel: Optional[bool] = False,
     ) -> Iterator[Golden]:
         from deepeval.evaluate.utils import (
             aggregate_metric_pass_rates,
@@ -1133,9 +1140,14 @@
         start_time = time.perf_counter()
         test_results: List[TestResult] = []
 
+        # sandwich start trace for OTEL
+        if run_otel:
+            ctx = self._start_otel_test_run()  # ignored span
+            ctx_token = attach(ctx)
+
         if async_config.run_async:
             loop = get_or_create_event_loop()
-
+            for golden in a_execute_agentic_test_cases_from_loop(
                 goldens=goldens,
                 identifier=identifier,
                 loop=loop,
@@ -1145,9 +1157,19 @@
                 cache_config=cache_config,
                 error_config=error_config,
                 async_config=async_config,
-            )
+            ):
+                if run_otel:
+                    _tracer = check_tracer()
+                    with _tracer.start_as_current_span(
+                        name=EVAL_DUMMY_SPAN_NAME,
+                        context=ctx,
+                    ):
+                        yield golden
+                else:
+                    yield golden
+
         else:
-
+            for golden in execute_agentic_test_cases_from_loop(
                 goldens=goldens,
                 trace_metrics=metrics,
                 display_config=display_config,
@@ -1155,7 +1177,16 @@
                 error_config=error_config,
                 test_results=test_results,
                 identifier=identifier,
-            )
+            ):
+                if run_otel:
+                    _tracer = check_tracer()
+                    with _tracer.start_as_current_span(
+                        name=EVAL_DUMMY_SPAN_NAME,
+                        context=ctx,
+                    ):
+                        yield golden
+                else:
+                    yield golden
 
         end_time = time.perf_counter()
         run_duration = end_time - start_time
@@ -1184,12 +1215,41 @@
         # clean up
         openai_test_case_pairs.clear()
         global_test_run_manager.save_test_run(TEMP_FILE_PATH)
-
-
-
-
-
-
+
+        # sandwich end trace for OTEL
+        if run_otel:
+            self._end_otel_test_run(ctx)
+            detach(ctx_token)
+
+        else:
+            confident_link = global_test_run_manager.wrap_up_test_run(
+                run_duration, display_table=False
+            )
+            return EvaluationResult(
+                test_results=test_results, confident_link=confident_link
+            )
 
     def evaluate(self, task: Task):
         global_evaluation_tasks.append(task)
+
+    def _start_otel_test_run(self, tracer: Optional[Tracer] = None) -> Context:
+        _tracer = check_tracer(tracer)
+        run_id = str(uuid.uuid4())
+        print("Starting OTLP test run with run_id: ", run_id)
+        ctx = baggage.set_baggage(
+            "confident.test_run.id", run_id, context=Context()
+        )
+        with _tracer.start_as_current_span(
+            "start_otel_test_run", context=ctx
+        ) as span:
+            span.set_attribute("confident.test_run.id", run_id)
+        return ctx
+
+    def _end_otel_test_run(self, ctx: Context, tracer: Optional[Tracer] = None):
+        run_id = baggage.get_baggage("confident.test_run.id", context=ctx)
+        print("Ending OTLP test run with run_id: ", run_id)
+        _tracer = check_tracer(tracer)
+        with _tracer.start_as_current_span(
+            "stop_otel_test_run", context=ctx
+        ) as span:
+            span.set_attribute("confident.test_run.id", run_id)
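Illustrative sketch of the new OTEL "sandwich" (not part of the diff). The generator whose signature gains `run_otel` above is assumed to be `EvaluationDataset.evals_iterator` (its name is not visible in these hunks), and `my_llm_app` is a stand-in for an instrumented application.

from deepeval.dataset import EvaluationDataset, Golden

def my_llm_app(prompt: str) -> str:
    # stand-in for the application under test; a real app would create spans here
    return f"echo: {prompt}"

dataset = EvaluationDataset(goldens=[Golden(input="What does DeepEval do?")])

# assumption: evals_iterator is the generator that now accepts run_otel
for golden in dataset.evals_iterator(run_otel=True):
    # each golden is yielded inside a dummy span (EVAL_DUMMY_SPAN_NAME) whose
    # context carries the confident.test_run.id baggage set in
    # _start_otel_test_run(), so spans created here join the same test run
    my_llm_app(golden.input)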
deepeval/dataset/test_run_tracer.py
ADDED
@@ -0,0 +1,82 @@
+import os
+from typing import Optional
+from opentelemetry import baggage
+from opentelemetry.trace import Tracer as OTelTracer
+from opentelemetry.sdk.trace import SpanProcessor
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+
+try:
+    from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
+        OTLPSpanExporter,
+    )
+
+    is_opentelemetry_installed = True
+except Exception:
+    is_opentelemetry_installed = False
+
+
+def is_opentelemetry_available():
+    if not is_opentelemetry_installed:
+        raise ImportError(
+            "OpenTelemetry SDK is not available. Please install it with `pip install opentelemetry-exporter-otlp-proto-http`."
+        )
+    return True
+
+
+from deepeval.confident.api import get_confident_api_key
+
+OTLP_ENDPOINT = (
+    os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
+    if os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
+    else "https://otel.confident-ai.com"
+)
+# OTLP_ENDPOINT = "http://127.0.0.1:4318"
+
+# Module-level globals to be imported and used by other code
+GLOBAL_TEST_RUN_TRACER_PROVIDER: Optional[TracerProvider] = None
+GLOBAL_TEST_RUN_TRACER: Optional[OTelTracer] = None
+
+
+class RunIdSpanProcessor(SpanProcessor):
+    def on_start(self, span, parent_context):
+        run_id = baggage.get_baggage(
+            "confident.test_run.id", context=parent_context
+        )
+        if run_id:
+            span.set_attribute("confident.test_run.id", run_id)
+
+    def on_end(self, span) -> None:  # type: ignore[override]
+        # No-op
+        return None
+
+    def shutdown(self) -> None:  # type: ignore[override]
+        # No-op
+        return None
+
+    def force_flush(self, timeout_millis: int = 30000) -> bool:  # type: ignore[override]
+        # No-op
+        return True
+
+
+def init_global_test_run_tracer(api_key: Optional[str] = None):
+    is_opentelemetry_available()
+    api_key = get_confident_api_key()
+    if api_key is None:
+        raise ValueError("CONFIDENT_API_KEY is not set")
+
+    provider = TracerProvider()
+    exporter = OTLPSpanExporter(
+        endpoint=f"{OTLP_ENDPOINT}/v1/traces",
+        headers={"x-confident-api-key": api_key},
+    )
+    provider.add_span_processor(RunIdSpanProcessor())
+    provider.add_span_processor(BatchSpanProcessor(span_exporter=exporter))
+    tracer = provider.get_tracer("deepeval_tracer")
+
+    global GLOBAL_TEST_RUN_TRACER_PROVIDER
+    global GLOBAL_TEST_RUN_TRACER
+    GLOBAL_TEST_RUN_TRACER_PROVIDER = provider
+    GLOBAL_TEST_RUN_TRACER = tracer
+
+    return provider, tracer
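A minimal wiring sketch (not part of the diff), assuming a Confident AI API key is configured and the OTLP HTTP exporter is installed:

from deepeval.dataset.test_run_tracer import init_global_test_run_tracer

# sets GLOBAL_TEST_RUN_TRACER_PROVIDER / GLOBAL_TEST_RUN_TRACER, which
# check_tracer() in deepeval/dataset/utils.py falls back to
provider, tracer = init_global_test_run_tracer()

with tracer.start_as_current_span("smoke-test"):
    pass  # exported to OTLP_ENDPOINT with the x-confident-api-key header

provider.force_flush()  # flush the BatchSpanProcessor before the process exits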
deepeval/dataset/utils.py
CHANGED
@@ -2,6 +2,10 @@ from typing import List, Optional, Any
 import json
 import re
 
+from opentelemetry.trace import Tracer
+from opentelemetry import trace
+from opentelemetry.trace import NoOpTracerProvider
+
 from deepeval.dataset.api import Golden
 from deepeval.dataset.golden import ConversationalGolden
 from deepeval.test_case import LLMTestCase, ConversationalTestCase, Turn
@@ -151,3 +155,22 @@ def parse_turns(turns_str: str) -> List[Turn]:
         )
     )
     return res
+
+
+def check_tracer(tracer: Optional[Tracer] = None) -> Tracer:
+    if tracer:
+        return tracer
+    # Prefer module-level test-run tracer if available
+    try:
+        from deepeval.dataset.test_run_tracer import (
+            GLOBAL_TEST_RUN_TRACER,
+        )
+
+        if GLOBAL_TEST_RUN_TRACER is not None:
+            return GLOBAL_TEST_RUN_TRACER
+    except Exception:
+        raise RuntimeError(
+            "No global OpenTelemetry tracer provider is configured."  # TODO: link to docs
+        )
+
+    return GLOBAL_TEST_RUN_TRACER
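A small sketch (not part of the diff) of check_tracer's resolution order: an explicitly passed tracer is returned as-is, otherwise the module-level GLOBAL_TEST_RUN_TRACER is used.

from opentelemetry import trace
from deepeval.dataset.utils import check_tracer

# 1) an explicitly passed tracer wins
explicit = trace.get_tracer("my-app")
assert check_tracer(explicit) is explicit

# 2) with no argument, the module-level GLOBAL_TEST_RUN_TRACER from
#    deepeval.dataset.test_run_tracer is returned once it has been
#    initialized via init_global_test_run_tracer()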
deepeval/key_handler.py
CHANGED
@@ -1,12 +1,42 @@
 """File for handling API key"""
 
+import os
 import json
+import logging
+
 from enum import Enum
 from typing import Union
 
 from .constants import KEY_FILE, HIDDEN_DIR
 
 
+logger = logging.getLogger(__name__)
+
+
+SECRET_KEYS = {
+    # General providers
+    "OPENAI_API_KEY",
+    "ANTHROPIC_API_KEY",
+    # Azure OpenAI
+    "AZURE_OPENAI_API_KEY",
+    # Google / Gemini
+    "GOOGLE_API_KEY",
+    # xAI Grok
+    "GROK_API_KEY",
+    # Moonshot
+    "MOONSHOT_API_KEY",
+    # DeepSeek
+    "DEEPSEEK_API_KEY",
+    # LiteLLM
+    "LITELLM_API_KEY",
+    # Local gateways (if any require keys)
+    "LOCAL_MODEL_API_KEY",
+    "LOCAL_EMBEDDING_API_KEY",
+}
+
+_WARNED_SECRET_KEYS = set()
+
+
 class KeyValues(Enum):
     # Confident AI
     API_KEY = "api_key"
@@ -50,6 +80,7 @@ class ModelKeyValues(Enum):
     OPENAI_MODEL_NAME = "OPENAI_MODEL_NAME"
     OPENAI_COST_PER_INPUT_TOKEN = "OPENAI_COST_PER_INPUT_TOKEN"
     OPENAI_COST_PER_OUTPUT_TOKEN = "OPENAI_COST_PER_OUTPUT_TOKEN"
+    OPENAI_API_KEY = "OPENAI_API_KEY"
     # Moonshot
     USE_MOONSHOT_MODEL = "USE_MOONSHOT_MODEL"
     MOONSHOT_MODEL_NAME = "MOONSHOT_MODEL_NAME"
@@ -79,10 +110,21 @@ class KeyFileHandler:
     def __init__(self):
         self.data = {}
 
+    def _ensure_dir(self):
+        os.makedirs(HIDDEN_DIR, exist_ok=True)
+
     def write_key(
         self, key: Union[KeyValues, ModelKeyValues, EmbeddingKeyValues], value
     ):
         """Appends or updates data in the hidden file"""
+
+        # hard stop on secrets: never write to disk
+        if key.value in SECRET_KEYS:
+            logger.warning(
+                f"{key} is blacklisted, refusing to persist. Keep your secrets in .env or .env.local instead"
+            )
+            return
+
         try:
             with open(f"{HIDDEN_DIR}/{KEY_FILE}", "r") as f:
                 # Load existing data
@@ -99,13 +141,15 @@
         self.data[key.value] = value
 
         # Write the updated data back to the file
+        self._ensure_dir()
         with open(f"{HIDDEN_DIR}/{KEY_FILE}", "w") as f:
             json.dump(self.data, f)
 
     def fetch_data(
         self, key: Union[KeyValues, ModelKeyValues, EmbeddingKeyValues]
     ):
-        """Fetches the data from the hidden file"""
+        """Fetches the data from the hidden file.
+        NOTE: secrets in this file are deprecated; prefer env/.env."""
         try:
             with open(f"{HIDDEN_DIR}/{KEY_FILE}", "r") as f:
                 try:
@@ -116,7 +160,24 @@
         except FileNotFoundError:
             # Handle the case when the file doesn't exist
             self.data = {}
-
+
+        value = self.data.get(key.value)
+
+        # Deprecation: warn only if we're actually returning a secret
+        if (
+            value is not None
+            and key.value in SECRET_KEYS
+            and key.value not in _WARNED_SECRET_KEYS
+        ):
+            logger.warning(
+                f"Reading secret '{key.value}' from legacy {HIDDEN_DIR}/{KEY_FILE}. "
+                "Persisting API keys in plaintext is deprecated. "
+                "Move this to your environment (.env / .env.local). "
+                "This fallback will be removed in a future release."
+            )
+            _WARNED_SECRET_KEYS.add(key.value)
+
+        return value
 
     def remove_key(
         self, key: Union[KeyValues, ModelKeyValues, EmbeddingKeyValues]
@@ -130,6 +191,7 @@
                 # Handle corrupted JSON file
                 self.data = {}
             self.data.pop(key.value, None)  # Remove the key if it exists
+            self._ensure_dir()
            with open(f"{HIDDEN_DIR}/{KEY_FILE}", "w") as f:
                 json.dump(self.data, f)
         except FileNotFoundError:
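Behavior sketch (not part of the diff): secrets named in SECRET_KEYS are no longer persisted to the hidden JSON key file, while ordinary settings still are. The example values are made up.

import logging

from deepeval.key_handler import KeyFileHandler, ModelKeyValues

logging.basicConfig(level=logging.WARNING)
handler = KeyFileHandler()

# OPENAI_API_KEY is in SECRET_KEYS: write_key() logs a warning and returns
# without writing to {HIDDEN_DIR}/{KEY_FILE}; keep the secret in .env/.env.local
handler.write_key(ModelKeyValues.OPENAI_API_KEY, "sk-do-not-persist")

# non-secret settings are still written to the hidden file as before
handler.write_key(ModelKeyValues.OPENAI_MODEL_NAME, "gpt-4o-mini")
print(handler.fetch_data(ModelKeyValues.OPENAI_MODEL_NAME))  # -> gpt-4o-mini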
deepeval/metrics/__init__.py
CHANGED
@@ -5,7 +5,8 @@ from .base_metric import (
     BaseArenaMetric,
 )
 
-from .dag.dag import DAGMetric
+from .dag.dag import DAGMetric, DeepAcyclicGraph
+from .conversational_dag.conversational_dag import ConversationalDAGMetric
 from .bias.bias import BiasMetric
 from .toxicity.toxicity import ToxicityMetric
 from .pii_leakage.pii_leakage import PIILeakageMetric
@@ -67,6 +68,8 @@ __all__ = [
     "ArenaGEval",
     "ConversationalGEval",
     "DAGMetric",
+    "DeepAcyclicGraph",
+    "ConversationalDAGMetric"
     # RAG metrics
     "AnswerRelevancyMetric",
     "FaithfulnessMetric",
deepeval/metrics/answer_relevancy/template.py
CHANGED
@@ -37,7 +37,7 @@ JSON:
 Please generate a list of JSON with two keys: `verdict` and `reason`.
 The 'verdict' key should STRICTLY be either a 'yes', 'idk' or 'no'. Answer 'yes' if the statement is relevant to addressing the original input, 'no' if the statement is irrelevant, and 'idk' if it is ambiguous (eg., not directly relevant but could be used as a supporting point to address the input).
 The 'reason' is the reason for the verdict.
-Provide a 'reason' ONLY if the answer is 'no'.
+Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
 The provided statements are statements made in the actual output.
 
 **
@@ -53,7 +53,8 @@ Example statements:
 "Security features include fingerprint authentication and an encrypted SSD.",
 "Every purchase comes with a one-year warranty.",
 "24/7 customer support is included.",
-"Pineapples taste great on pizza."
+"Pineapples taste great on pizza.",
+"The laptop is a Dell XPS 13."
 ]
 
 Example JSON:
@@ -79,6 +80,10 @@ Example JSON:
     {{
         "verdict": "no",
        "reason": "The statement about pineapples on pizza is completely irrelevant to the input, which asks about laptop features."
+    }},
+    {{
+        "verdict": "idk",
+        "reason": "The statement about the laptop being a Dell XPS 13 is not directly relevant to the input, but could be used as a supporting point to address the input."
     }}
 ]
 }}
deepeval/metrics/conversational_dag/conversational_dag.py
ADDED
@@ -0,0 +1,139 @@
+from typing import Optional, Union
+from deepeval.metrics import BaseConversationalMetric
+from deepeval.test_case import (
+    ConversationalTestCase,
+)
+from deepeval.utils import get_or_create_event_loop
+from deepeval.metrics.utils import (
+    check_conversational_test_case_params,
+    construct_verbose_logs,
+    initialize_model,
+)
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.g_eval.schema import *
+from deepeval.metrics import DeepAcyclicGraph
+from deepeval.metrics.dag.utils import (
+    is_valid_dag_from_roots,
+    extract_required_params,
+    copy_graph,
+)
+
+
+class ConversationalDAGMetric(BaseConversationalMetric):
+
+    def __init__(
+        self,
+        name: str,
+        dag: DeepAcyclicGraph,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+        threshold: float = 0.5,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+        _include_dag_suffix: bool = True,
+    ):
+        if (
+            is_valid_dag_from_roots(
+                root_nodes=dag.root_nodes, multiturn=dag.multiturn
+            )
+            == False
+        ):
+            raise ValueError("Cycle detected in DAG graph.")
+
+        self._verbose_steps: List[str] = []
+        self.dag = copy_graph(dag)
+        self.name = name
+        self.model, self.using_native_model = initialize_model(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.threshold = 1 if strict_mode else threshold
+        self.include_reason = include_reason
+        self.strict_mode = strict_mode
+        self.async_mode = async_mode
+        self.verbose_mode = verbose_mode
+        self._include_dag_suffix = _include_dag_suffix
+
+    def measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+    ) -> float:
+        check_conversational_test_case_params(
+            test_case,
+            extract_required_params(self.dag.root_nodes, multiturn=True),
+            self,
+        )
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                    )
+                )
+            else:
+                self.dag._execute(metric=self, test_case=test_case)
+                self.success = self.is_successful()
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        *self._verbose_steps,
+                        f"Score: {self.score}\nReason: {self.reason}",
+                    ],
+                )
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+    ) -> float:
+        check_conversational_test_case_params(
+            test_case,
+            extract_required_params(self.dag.root_nodes, multiturn=True),
+            self,
+        )
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        ):
+            await self.dag._a_execute(metric=self, test_case=test_case)
+            self.success = self.is_successful()
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    *self._verbose_steps,
+                    f"Score: {self.score}\nReason: {self.reason}",
+                ],
+            )
+            return self.score
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        if self._include_dag_suffix:
+            return f"{self.name} [ConversationalDAG]"
+        else:
+            return self.name