deepeval 3.4.8__py3-none-any.whl → 3.5.0__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- deepeval/__init__.py +8 -5
- deepeval/_version.py +1 -1
- deepeval/benchmarks/drop/drop.py +2 -3
- deepeval/benchmarks/hellaswag/hellaswag.py +2 -2
- deepeval/benchmarks/logi_qa/logi_qa.py +2 -2
- deepeval/benchmarks/math_qa/math_qa.py +2 -2
- deepeval/benchmarks/mmlu/mmlu.py +2 -2
- deepeval/benchmarks/truthful_qa/truthful_qa.py +2 -2
- deepeval/cli/main.py +561 -727
- deepeval/confident/api.py +30 -14
- deepeval/config/__init__.py +0 -0
- deepeval/config/settings.py +565 -0
- deepeval/config/settings_manager.py +133 -0
- deepeval/config/utils.py +86 -0
- deepeval/dataset/__init__.py +1 -0
- deepeval/dataset/dataset.py +70 -10
- deepeval/dataset/test_run_tracer.py +82 -0
- deepeval/dataset/utils.py +23 -0
- deepeval/integrations/pydantic_ai/__init__.py +2 -4
- deepeval/integrations/pydantic_ai/{setup.py → otel.py} +0 -8
- deepeval/integrations/pydantic_ai/patcher.py +376 -0
- deepeval/key_handler.py +1 -0
- deepeval/metrics/answer_relevancy/template.py +7 -2
- deepeval/metrics/faithfulness/template.py +11 -8
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +6 -4
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +6 -4
- deepeval/metrics/tool_correctness/tool_correctness.py +7 -3
- deepeval/models/llms/amazon_bedrock_model.py +24 -3
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/kimi_model.py +1 -1
- deepeval/models/llms/openai_model.py +37 -41
- deepeval/models/retry_policy.py +280 -0
- deepeval/openai_agents/agent.py +4 -2
- deepeval/test_run/api.py +1 -0
- deepeval/tracing/otel/exporter.py +20 -8
- deepeval/tracing/otel/utils.py +57 -0
- deepeval/tracing/perf_epoch_bridge.py +4 -4
- deepeval/tracing/tracing.py +37 -16
- deepeval/tracing/utils.py +98 -1
- deepeval/utils.py +111 -70
- {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/METADATA +16 -13
- {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/RECORD +45 -40
- deepeval/env.py +0 -35
- deepeval/integrations/pydantic_ai/agent.py +0 -364
- {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/LICENSE.md +0 -0
- {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/WHEEL +0 -0
- {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/entry_points.txt +0 -0
deepeval/tracing/tracing.py
CHANGED
@@ -9,16 +9,16 @@ import atexit
 import queue
 import uuid
 import os
+import json
+import time
 from openai import OpenAI
 from rich.console import Console
 from rich.progress import Progress
 
-
+from deepeval.config.settings import get_settings
 from deepeval.constants import (
     CONFIDENT_TRACE_VERBOSE,
     CONFIDENT_TRACE_FLUSH,
-    CONFIDENT_SAMPLE_RATE,
-    CONFIDENT_TRACE_ENVIRONMENT,
 )
 from deepeval.confident.api import Api, Endpoints, HttpMethods, is_confident
 from deepeval.metrics import BaseMetric
@@ -50,12 +50,16 @@ from deepeval.tracing.utils import (
     tracing_enabled,
     validate_environment,
     validate_sampling_rate,
+    dump_body_to_json_file,
+    get_deepeval_trace_mode,
 )
 from deepeval.utils import dataclass_to_dict
 from deepeval.tracing.context import current_span_context, current_trace_context
 from deepeval.tracing.types import TestCaseMetricPair
 from deepeval.tracing.api import PromptApi
 
+EVAL_DUMMY_SPAN_NAME = "evals_iterator"
+
 
 class TraceManager:
     def __init__(self):
@@ -65,25 +69,27 @@ class TraceManager:
             {}
         )  # Map of span_uuid to BaseSpan
 
+        settings = get_settings()
         # Initialize queue and worker thread for trace posting
         self._trace_queue = queue.Queue()
         self._worker_thread = None
         self._min_interval = 0.2  # Minimum time between API calls (seconds)
         self._last_post_time = 0
         self._in_flight_tasks: Set[asyncio.Task[Any]] = set()
-        self.
-
-        )
+        self._flush_enabled = bool(settings.CONFIDENT_TRACE_FLUSH)
+        self._daemon = not self._flush_enabled
 
         # trace manager attributes
         self.confident_api_key = None
         self.custom_mask_fn: Optional[Callable] = None
-        self.environment =
-            CONFIDENT_TRACE_ENVIRONMENT
+        self.environment = (
+            settings.CONFIDENT_TRACE_ENVIRONMENT
+            if settings.CONFIDENT_TRACE_ENVIRONMENT is not None
+            else Environment.DEVELOPMENT.value
         )
         validate_environment(self.environment)
 
-        self.sampling_rate =
+        self.sampling_rate = settings.CONFIDENT_SAMPLE_RATE
         validate_sampling_rate(self.sampling_rate)
         self.openai_client = None
         self.tracing_enabled = True
@@ -103,7 +109,8 @@ class TraceManager:
         queue_size = self._trace_queue.qsize()
         in_flight = len(self._in_flight_tasks)
         remaining_tasks = queue_size + in_flight
-
+
+        if not self._flush_enabled and remaining_tasks > 0:
             self._print_trace_status(
                 message=f"WARNING: Exiting with {queue_size + in_flight} abaonded trace(s).",
                 trace_worker_status=TraceWorkerStatus.WARNING,
@@ -179,8 +186,14 @@ class TraceManager:
         if trace.status == TraceSpanStatus.IN_PROGRESS:
             trace.status = TraceSpanStatus.SUCCESS
 
+        mode = get_deepeval_trace_mode()
+        if mode == "gen":
+            body = self.create_trace_api(trace).model_dump(
+                by_alias=True, exclude_none=True
+            )
+            dump_body_to_json_file(body)
         # Post the trace to the server before removing it
-
+        elif not self.evaluating:
             self.post_trace(trace)
         else:
             if self.evaluation_loop:
@@ -237,6 +250,15 @@ class TraceManager:
             # This is a child span, find its parent and add it to the parent's children
             parent_span = self.get_span_by_uuid(span.parent_uuid)
             if parent_span:
+
+                if (
+                    parent_span.name == EVAL_DUMMY_SPAN_NAME
+                ):  # ignored span for evaluation
+                    span.parent_uuid = None
+                    trace.root_spans.remove(parent_span)
+                    trace.root_spans.append(span)
+                    return
+
                 parent_span.children.append(span)
             else:
                 trace.root_spans.append(span)
@@ -274,10 +296,7 @@ class TraceManager:
         description: Optional[str] = None,
         environment: Optional[str] = None,
     ):
-        if (
-            os.getenv(CONFIDENT_TRACE_VERBOSE, "YES").upper() != "NO"
-            and self.evaluating is False
-        ):
+        if get_settings().CONFIDENT_TRACE_VERBOSE and self.evaluating is False:
             console = Console()
             message_prefix = "[dim][Confident AI Trace Log][/dim]"
             if trace_worker_status == TraceWorkerStatus.SUCCESS:
@@ -401,6 +420,7 @@ class TraceManager:
             api = Api(api_key=trace_api.confident_api_key)
         else:
             api = Api(api_key=self.confident_api_key)
+
         api_response, link = await api.a_send_request(
             method=HttpMethods.POST,
             endpoint=Endpoints.TRACES_ENDPOINT,
@@ -415,7 +435,7 @@ class TraceManager:
                 description=link,
                 environment=self.environment,
             )
-        elif
+        elif self._flush_enabled:
             # Main thread gone → to be flushed
             remaining_trace_request_bodies.append(body)
 
@@ -492,6 +512,7 @@ class TraceManager:
         with capture_send_trace():
            try:
                api = Api(api_key=self.confident_api_key)
+
                _, link = api.send_request(
                    method=HttpMethods.POST,
                    endpoint=Endpoints.TRACES_ENDPOINT,
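The pattern above recurs throughout this release: ad-hoc `os.getenv(...)` reads are replaced by typed attributes on a settings object from the new `deepeval.config.settings` module. A minimal before/after sketch, using only names that appear in this diff (the string key in the old-style call stands in for the `CONFIDENT_TRACE_VERBOSE` constant, and exact defaults are assumptions):

```python
import os

from deepeval.config.settings import get_settings

# 3.4.8-style: each call site parsed raw env strings itself
verbose_old = os.getenv("CONFIDENT_TRACE_VERBOSE", "YES").upper() != "NO"

# 3.5.0-style: one cached settings object with typed fields
settings = get_settings()
verbose_new = bool(settings.CONFIDENT_TRACE_VERBOSE)
sample_rate = settings.CONFIDENT_SAMPLE_RATE          # was read from the CONFIDENT_SAMPLE_RATE env var
flush_on_exit = bool(settings.CONFIDENT_TRACE_FLUSH)  # drives TraceManager._flush_enabled
```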
deepeval/tracing/utils.py
CHANGED
@@ -1,8 +1,15 @@
 import os
+import time
+import inspect
+import json
+import sys
+import difflib
 from datetime import datetime, timezone
 from enum import Enum
 from time import perf_counter
+import time
 from collections import deque
+from typing import Any, Dict, Optional, Sequence, Callable
 
 from deepeval.constants import CONFIDENT_TRACING_ENABLED
 
@@ -14,6 +21,12 @@ class Environment(Enum):
     TESTING = "testing"
 
 
+def _strip_nul(s: str) -> str:
+    # Replace embedded NUL, which Postgres cannot store in text/jsonb
+    # Do NOT try to escape as \u0000 because PG will still reject it.
+    return s.replace("\x00", "")
+
+
 def tracing_enabled():
     return os.getenv(CONFIDENT_TRACING_ENABLED, "YES").upper() == "YES"
 
@@ -42,6 +55,11 @@ def make_json_serializable(obj):
 
     def _serialize(o):
         oid = id(o)
+
+        # strip Nulls
+        if isinstance(o, str):
+            return _strip_nul(o)
+
         # Primitive types are already serializable
         if isinstance(o, (str, int, float, bool)) or o is None:
             return o
@@ -77,7 +95,7 @@ def make_json_serializable(obj):
         return result
 
         # Fallback: convert to string
-        return str(o)
+        return _strip_nul(str(o))
 
     return _serialize(obj)
 
@@ -115,3 +133,82 @@ def replace_self_with_class_name(obj):
         return f"<{obj.__class__.__name__}>"
     except:
         return f"<self>"
+
+
+def get_deepeval_trace_mode() -> Optional[str]:
+    deepeval_trace_mode = None
+    try:
+        args = sys.argv
+        for idx, arg in enumerate(args):
+            if isinstance(arg, str) and arg.startswith(
+                "--deepeval-trace-mode="
+            ):
+                deepeval_trace_mode = (
+                    arg.split("=", 1)[1].strip().strip('"').strip("'").lower()
+                )
+                break
+            if arg == "--deepeval-trace-mode" and idx + 1 < len(args):
+                deepeval_trace_mode = (
+                    str(args[idx + 1]).strip().strip('"').strip("'").lower()
+                )
+                break
+    except Exception:
+        deepeval_trace_mode = None
+
+    return deepeval_trace_mode
+
+
+def dump_body_to_json_file(
+    body: Dict[str, Any], file_path: Optional[str] = None
+) -> str:
+    entry_file = None
+    try:
+        cmd0 = sys.argv[0] if sys.argv else None
+        if cmd0 and cmd0.endswith(".py"):
+            entry_file = cmd0
+        else:
+            for frame_info in reversed(inspect.stack()):
+                fp = frame_info.filename
+                if (
+                    fp
+                    and fp.endswith(".py")
+                    and "deepeval/tracing" not in fp
+                    and "site-packages" not in fp
+                ):
+                    entry_file = fp
+                    break
+    except Exception:
+        entry_file = None
+
+    if not entry_file:
+        entry_file = "unknown.py"
+
+    abs_entry = os.path.abspath(entry_file)
+    dir_path = os.path.dirname(abs_entry)
+
+    file_arg = None
+    try:
+        for idx, arg in enumerate(sys.argv):
+            if isinstance(arg, str) and arg.startswith(
+                "--deepeval-trace-file-name="
+            ):
+                file_arg = arg.split("=", 1)[1].strip().strip('"').strip("'")
+                break
+            if arg == "--deepeval-trace-file-name" and idx + 1 < len(sys.argv):
+                file_arg = str(sys.argv[idx + 1]).strip().strip('"').strip("'")
+                break
+    except Exception:
+        file_arg = None
+
+    if file_path:
+        dst_path = os.path.abspath(file_path)
+    elif file_arg:
+        dst_path = os.path.abspath(file_arg)
+    else:
+        base_name = os.path.splitext(os.path.basename(abs_entry))[0]
+        dst_path = os.path.join(dir_path, f"{base_name}.json")
+
+    actual_body = make_json_serializable(body)
+    with open(dst_path, "w", encoding="utf-8") as f:
+        json.dump(actual_body, f, ensure_ascii=False, indent=2, sort_keys=True)
+    return dst_path
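The new `_strip_nul` helper exists because Postgres rejects embedded NUL characters in text/jsonb columns even when escaped as `\u0000`, and `make_json_serializable` now strips them from every string it serializes. A small illustration of the intended effect (the sample payload is made up):

```python
from deepeval.tracing.utils import _strip_nul, make_json_serializable

payload = {"output": "hello\x00world", "n": 3}

print(_strip_nul(payload["output"]))    # -> "helloworld" (NUL dropped, not escaped)
print(make_json_serializable(payload))  # expected: {'output': 'helloworld', 'n': 3}
```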
deepeval/utils.py
CHANGED
@@ -1,26 +1,34 @@
-from contextvars import ContextVar
-from enum import Enum
 import copy
 import os
 import json
 import time
-from typing import Any, Optional, Dict, List, Union
-from collections.abc import Iterable
 import webbrowser
 import tqdm
 import re
 import string
-from dataclasses import asdict, is_dataclass
-import re
 import asyncio
 import nest_asyncio
 import uuid
+import math
+
+from contextvars import ContextVar
+from enum import Enum
+from typing import Any, Optional, Dict, List, Union
+from collections.abc import Iterable
+from dataclasses import asdict, is_dataclass
 from pydantic import BaseModel
 from rich.progress import Progress
 from rich.console import Console, Theme
 
 from deepeval.confident.api import set_confident_api_key
 from deepeval.constants import CONFIDENT_OPEN_BROWSER
+from deepeval.config.settings import get_settings
+from deepeval.config.utils import (
+    parse_bool,
+    get_env_bool,
+    bool_to_env_str,
+    set_env_bool,
+)
 
 
 def get_lcs(seq1, seq2):
@@ -140,82 +148,55 @@ def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
     return loop
 
 
-def
-
-
-
-    else:
-        return False
-    except:
-        return False
+def set_should_skip_on_missing_params(yes: bool):
+    s = get_settings()
+    with s.edit(persist=False):
+        s.SKIP_DEEPEVAL_MISSING_PARAMS = yes
 
 
-def
-
-        os.environ["SKIP_DEEPEVAL_MISSING_PARAMS"] = "YES"
-    else:
-        os.environ["SKIP_DEEPEVAL_MISSING_PARAMS"] = "NO"
+def should_ignore_errors() -> bool:
+    return bool(get_settings().IGNORE_DEEPEVAL_ERRORS)
 
 
-def
-
-        if os.environ["IGNORE_DEEPEVAL_ERRORS"] == "YES":
-            return True
-        else:
-            return False
-    except:
-        return False
+def should_skip_on_missing_params() -> bool:
+    return bool(get_settings().SKIP_DEEPEVAL_MISSING_PARAMS)
 
 
 def set_should_ignore_errors(yes: bool):
-
-
-
-        os.environ["IGNORE_DEEPEVAL_ERRORS"] = "NO"
+    s = get_settings()
+    with s.edit(persist=False):
+        s.IGNORE_DEEPEVAL_ERRORS = yes
 
 
-def should_verbose_print() ->
-
-        if os.environ["DEEPEVAL_VERBOSE_MODE"] == "YES":
-            return True
-        else:
-            return None
-    except:
-        return None
+def should_verbose_print() -> bool:
+    return bool(get_settings().DEEPEVAL_VERBOSE_MODE)
 
 
 def set_verbose_mode(yes: Optional[bool]):
-
-
+    s = get_settings()
+    with s.edit(persist=False):
+        s.DEEPEVAL_VERBOSE_MODE = yes
 
 
 def set_identifier(identifier: Optional[str]):
     if identifier:
-
+        s = get_settings()
+        with s.edit(persist=False):
+            s.DEEPEVAL_IDENTIFIER = identifier
 
 
 def get_identifier() -> Optional[str]:
-
-        return os.environ["DEEPEVAL_IDENTIFIER"]
-    except:
-        return None
+    return get_settings().DEEPEVAL_IDENTIFIER
 
 
-def should_use_cache():
-
-        if os.environ["ENABLE_DEEPEVAL_CACHE"] == "YES":
-            return True
-        else:
-            return False
-    except:
-        return False
+def should_use_cache() -> bool:
+    return bool(get_settings().ENABLE_DEEPEVAL_CACHE)
 
 
 def set_should_use_cache(yes: bool):
-
-
-
-        os.environ["ENABLE_DEEPEVAL_CACHE"] = "NO"
+    s = get_settings()
+    with s.edit(persist=False):
+        s.ENABLE_DEEPEVAL_CACHE = yes
 
 
 def login(api_key: str):
@@ -233,17 +214,11 @@ def login(api_key: str):
 
 
 def set_is_running_deepeval(flag: bool):
-
-        os.environ["DEEPEVAL"] = "YES"
-    else:
-        os.environ["DEEPEVAL"] = "NO"
+    set_env_bool("DEEPEVAL", flag)
 
 
 def get_is_running_deepeval() -> bool:
-
-        return os.environ["DEEPEVAL"] == "YES"
-    except:
-        return False
+    return get_env_bool("DEEPEVAL")
 
 
 def is_in_ci_env() -> bool:
@@ -270,8 +245,8 @@ def is_in_ci_env() -> bool:
 
 
 def open_browser(url: str):
-    if
-        if is_in_ci_env()
+    if get_settings().CONFIDENT_OPEN_BROWSER:
+        if not is_in_ci_env():
             webbrowser.open(url)
 
 
@@ -439,6 +414,8 @@ def normalize_text(text: str) -> str:
 
 
 def get_freer_gpu():
+    import numpy as np
+
     os.system("nvidia-smi -q -d Memory |grep -A4 GPU|grep Free >tmp_smi")
     memory_available = [
         int(x.split()[2]) + 5 * i
@@ -466,8 +443,11 @@ def wait_free_gpu(gb_needed):
 def select_freer_gpu():
     freer_gpu = str(get_freer_gpu())
     print("Will use GPU: %s" % (freer_gpu))
-
-
+
+    s = get_settings()
+    with s.edit(persist=False):
+        s.CUDA_LAUNCH_BLOCKING = True
+        s.CUDA_VISIBLE_DEVICES = freer_gpu
     return freer_gpu
 
 
@@ -535,6 +515,67 @@ def remove_pbars(
         progress.remove_task(pbar_id)
 
 
+def read_env_int(
+    name: str, default: int, *, min_value: Union[int, None] = None
+) -> int:
+    """Read an integer from an environment variable with safe fallback.
+
+    Attempts to read os.environ[name] and parse it as an int. If the variable
+    is unset, cannot be parsed, or is less than `min_value` (when provided),
+    the function returns `default`.
+
+    Args:
+        name: Environment variable name to read.
+        default: Value to return when the env var is missing/invalid/out of range.
+        min_value: Optional inclusive lower bound; values < min_value are rejected.
+
+    Returns:
+        The parsed integer, or `default` on any failure.
+    """
+    raw = os.getenv(name)
+    if raw is None:
+        return default
+    try:
+        v = int(raw)
+        if min_value is not None and v < min_value:
+            return default
+        return v
+    except Exception:
+        return default
+
+
+def read_env_float(
+    name: str, default: float, *, min_value: Union[float, None] = None
+) -> float:
+    """Read a float from an environment variable with safe fallback.
+
+    Attempts to read os.environ[name] and parse it as a float. If the variable
+    is unset, cannot be parsed, or is less than `min_value` (when provided),
+    the function returns `default`.
+
+    Args:
+        name: Environment variable name to read.
+        default: Value to return when the env var is missing/invalid/out of range.
+        min_value: Optional inclusive lower bound; values < min_value are rejected.
+
+    Returns:
+        The parsed float, or `default` on any failure.
+    """
+    raw = os.getenv(name)
+    if raw is None:
+        return default
+    try:
+        v = float(raw)
+    except Exception:
+        return default
+
+    if not math.isfinite(v):
+        return default
+    if min_value is not None and v < min_value:
+        return default
+    return v
+
+
 my_theme = Theme(
     {
         "bar.complete": "#11ff00",
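The two new env readers are deliberately defensive: a missing variable, an unparsable value, a non-finite float, or a value below `min_value` all fall back to the default. A short usage sketch (the variable names are illustrative):

```python
import os

from deepeval.utils import read_env_int, read_env_float

os.environ["MY_TIMEOUT_S"] = "2.5"
os.environ["MY_RETRIES"] = "not-a-number"

print(read_env_float("MY_TIMEOUT_S", 10.0, min_value=0.0))  # -> 2.5
print(read_env_int("MY_RETRIES", 3, min_value=0))           # -> 3 (unparsable, falls back)
print(read_env_float("UNSET_VAR", 1.0))                     # -> 1.0 (missing, falls back)
```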
{deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepeval
-Version: 3.4.8
+Version: 3.5.0
 Summary: The LLM Evaluation Framework
 Home-page: https://github.com/confident-ai/deepeval
 License: Apache-2.0
@@ -25,6 +25,8 @@ Requires-Dist: opentelemetry-exporter-otlp-proto-grpc (>=1.24.0,<2.0.0)
 Requires-Dist: opentelemetry-sdk (>=1.24.0,<2.0.0)
 Requires-Dist: portalocker
 Requires-Dist: posthog (>=6.3.0,<7.0.0)
+Requires-Dist: pydantic (>=2.11.7,<3.0.0)
+Requires-Dist: pydantic-settings (>=2.10.1,<3.0.0)
 Requires-Dist: pyfiglet
 Requires-Dist: pytest
 Requires-Dist: pytest-asyncio
@@ -187,16 +189,6 @@ Let's pretend your LLM application is a RAG based customer support chatbot; here
 ```
 pip install -U deepeval
 ```
-### Environment variables (.env / .env.local)
-
-DeepEval auto-loads `.env.local` then `.env` from the current working directory **at import time**.
-**Precedence:** process env -> `.env.local` -> `.env`.
-Opt out with `DEEPEVAL_DISABLE_DOTENV=1`.
-
-```bash
-cp .env.example .env.local
-# then edit .env.local (ignored by git)
-```
 
 ## Create an account (highly recommended)
 
@@ -389,9 +381,20 @@ evaluate(dataset, [answer_relevancy_metric])
 dataset.evaluate([answer_relevancy_metric])
 ```
 
-
+## A Note on Env Variables (.env / .env.local)
+
+DeepEval auto-loads `.env.local` then `.env` from the current working directory **at import time**.
+**Precedence:** process env -> `.env.local` -> `.env`.
+Opt out with `DEEPEVAL_DISABLE_DOTENV=1`.
+
+```bash
+cp .env.example .env.local
+# then edit .env.local (ignored by git)
+```
+
+# DeepEval With Confident AI
 
-
+DeepEval's cloud platform, [Confident AI](https://confident-ai.com?utm_source=Github), allows you to:
 
 1. Curate/annotate evaluation datasets on the cloud
 2. Benchmark LLM app using dataset, and compare with previous iterations to experiment which models/prompts works best