deepeval 3.4.7__py3-none-any.whl → 3.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +8 -7
- deepeval/_version.py +1 -1
- deepeval/cli/dotenv_handler.py +71 -0
- deepeval/cli/main.py +1021 -280
- deepeval/cli/utils.py +116 -2
- deepeval/confident/api.py +29 -14
- deepeval/config/__init__.py +0 -0
- deepeval/config/settings.py +565 -0
- deepeval/config/settings_manager.py +133 -0
- deepeval/config/utils.py +86 -0
- deepeval/dataset/__init__.py +1 -0
- deepeval/dataset/dataset.py +70 -10
- deepeval/dataset/test_run_tracer.py +82 -0
- deepeval/dataset/utils.py +23 -0
- deepeval/key_handler.py +64 -2
- deepeval/metrics/__init__.py +4 -1
- deepeval/metrics/answer_relevancy/template.py +7 -2
- deepeval/metrics/conversational_dag/__init__.py +7 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +139 -0
- deepeval/metrics/conversational_dag/nodes.py +931 -0
- deepeval/metrics/conversational_dag/templates.py +117 -0
- deepeval/metrics/dag/dag.py +13 -4
- deepeval/metrics/dag/graph.py +47 -15
- deepeval/metrics/dag/utils.py +103 -38
- deepeval/metrics/faithfulness/template.py +11 -8
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +6 -4
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +6 -4
- deepeval/metrics/tool_correctness/tool_correctness.py +7 -3
- deepeval/models/llms/amazon_bedrock_model.py +24 -3
- deepeval/models/llms/openai_model.py +37 -41
- deepeval/models/retry_policy.py +280 -0
- deepeval/openai_agents/agent.py +4 -2
- deepeval/synthesizer/chunking/doc_chunker.py +87 -51
- deepeval/test_run/api.py +1 -0
- deepeval/tracing/otel/exporter.py +20 -8
- deepeval/tracing/otel/utils.py +57 -0
- deepeval/tracing/tracing.py +37 -16
- deepeval/tracing/utils.py +98 -1
- deepeval/utils.py +111 -70
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/METADATA +3 -1
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/RECORD +44 -34
- deepeval/env.py +0 -35
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/LICENSE.md +0 -0
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/WHEEL +0 -0
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/entry_points.txt +0 -0
deepeval/test_run/api.py
CHANGED
@@ -50,6 +50,7 @@ class LLMApiTestCase(BaseModel):
     trace: Optional[TraceApi] = Field(None)

     model_config = ConfigDict(arbitrary_types_allowed=True)
+    # metric_collection: Optional[str] = Field(None, alias="metricCollection")

     def update_metric_data(self, metric_data: MetricData):
         if self.metrics_data is None:
deepeval/tracing/otel/exporter.py
CHANGED
@@ -30,6 +30,7 @@ from deepeval.tracing.otel.utils import (
     to_hex_string,
     parse_string,
     parse_list_of_strings,
+    post_test_run,
 )
 from deepeval.tracing import perf_epoch_bridge as peb
 from deepeval.tracing.types import TraceAttributes
@@ -80,7 +81,8 @@ class ConfidentSpanExporter(SpanExporter):
         self,
         spans: typing.Sequence[ReadableSpan],
         timeout_millis: int = 30000,
-        api_key: Optional[str] = None,  # dynamic api key
+        api_key: Optional[str] = None,  # dynamic api key,
+        _test_run_id: Optional[str] = None,
     ) -> SpanExportResult:
         # build forest of spans
         forest = self._build_span_forest(spans)
@@ -223,14 +225,24 @@ class ConfidentSpanExporter(SpanExporter):
            trace_manager.add_span_to_trace(base_span_wrapper.base_span)
            # no removing span because it can be parent of other spans

-        # safely end all active traces
+        # safely end all active traces or return them for test runs
         active_traces_keys = list(trace_manager.active_traces.keys())
-
-
-
-
-
-
+        if _test_run_id:
+            traces = []
+            for trace_key in active_traces_keys:
+                set_trace_time(trace_manager.get_trace_by_uuid(trace_key))
+                trace = trace_manager.get_trace_by_uuid(trace_key)
+                if trace:
+                    traces.append(trace)
+            trace_manager.clear_traces()
+            post_test_run(traces, _test_run_id)
+            return SpanExportResult.SUCCESS
+        else:
+            for trace_key in active_traces_keys:
+                set_trace_time(trace_manager.get_trace_by_uuid(trace_key))
+                trace_manager.end_trace(trace_key)
+            trace_manager.clear_traces()
+            return SpanExportResult.SUCCESS

     def _convert_readable_span_to_base_span(
         self, span: ReadableSpan
deepeval/tracing/otel/utils.py
CHANGED
@@ -1,5 +1,6 @@
 from typing import List, Optional, Tuple, Any
 from deepeval.tracing.types import Trace, LLMTestCase, ToolCall
+from deepeval.tracing import trace_manager, BaseSpan
 from opentelemetry.sdk.trace.export import ReadableSpan
 import json

@@ -250,3 +251,59 @@ def parse_list_of_strings(context: List[str]) -> List[str]:
         else:
             parsed_context.append(context_str)
     return parsed_context
+
+
+from deepeval.evaluate.utils import create_api_test_case
+from deepeval.test_run.api import LLMApiTestCase
+from deepeval.test_run.test_run import global_test_run_manager
+from typing import Optional
+
+
+def post_test_run(traces: List[Trace], test_run_id: Optional[str]):
+    # Accept single trace or list of traces
+    if isinstance(traces, Trace):
+        traces = [traces]
+
+    api_test_cases: List[LLMApiTestCase] = []
+
+    # Collect test cases from spans that have metric_collection
+    for trace in traces:
+        trace_api = trace_manager.create_trace_api(trace)
+
+        def dfs(span: BaseSpan):
+            if span.metric_collection:
+                llm_test_case = LLMTestCase(
+                    input=str(span.input),
+                    actual_output=(
+                        str(span.output) if span.output is not None else None
+                    ),
+                    expected_output=span.expected_output,
+                    context=span.context,
+                    retrieval_context=span.retrieval_context,
+                    tools_called=span.tools_called,
+                    expected_tools=span.expected_tools,
+                )
+                api_case = create_api_test_case(
+                    test_case=llm_test_case,
+                    trace=trace_api,
+                    index=None,
+                )
+                if isinstance(api_case, LLMApiTestCase):
+                    api_case.metric_collection = span.metric_collection
+                    api_test_cases.append(api_case)
+
+            for child in span.children or []:
+                dfs(child)
+
+        for root in trace.root_spans:
+            dfs(root)
+
+    # Prepare and post TestRun using the global test run manager
+    test_run_manager = global_test_run_manager
+    test_run_manager.create_test_run(identifier=test_run_id)
+    test_run = test_run_manager.get_test_run()
+
+    for case in api_test_cases:
+        test_run.add_test_case(case)
+
+    # return test_run_manager.post_test_run(test_run) TODO: add after test run with metric collection is implemented
deepeval/tracing/tracing.py
CHANGED
@@ -9,16 +9,16 @@ import atexit
 import queue
 import uuid
 import os
+import json
+import time
 from openai import OpenAI
 from rich.console import Console
 from rich.progress import Progress

-
+from deepeval.config.settings import get_settings
 from deepeval.constants import (
     CONFIDENT_TRACE_VERBOSE,
     CONFIDENT_TRACE_FLUSH,
-    CONFIDENT_SAMPLE_RATE,
-    CONFIDENT_TRACE_ENVIRONMENT,
 )
 from deepeval.confident.api import Api, Endpoints, HttpMethods, is_confident
 from deepeval.metrics import BaseMetric
@@ -50,12 +50,16 @@ from deepeval.tracing.utils import (
     tracing_enabled,
     validate_environment,
     validate_sampling_rate,
+    dump_body_to_json_file,
+    get_deepeval_trace_mode,
 )
 from deepeval.utils import dataclass_to_dict
 from deepeval.tracing.context import current_span_context, current_trace_context
 from deepeval.tracing.types import TestCaseMetricPair
 from deepeval.tracing.api import PromptApi

+EVAL_DUMMY_SPAN_NAME = "evals_iterator"
+

 class TraceManager:
     def __init__(self):
@@ -65,25 +69,27 @@ class TraceManager:
            {}
        )  # Map of span_uuid to BaseSpan

+        settings = get_settings()
         # Initialize queue and worker thread for trace posting
         self._trace_queue = queue.Queue()
         self._worker_thread = None
         self._min_interval = 0.2  # Minimum time between API calls (seconds)
         self._last_post_time = 0
         self._in_flight_tasks: Set[asyncio.Task[Any]] = set()
-        self.
-
-        )
+        self._flush_enabled = bool(settings.CONFIDENT_TRACE_FLUSH)
+        self._daemon = not self._flush_enabled

         # trace manager attributes
         self.confident_api_key = None
         self.custom_mask_fn: Optional[Callable] = None
-        self.environment =
-            CONFIDENT_TRACE_ENVIRONMENT
+        self.environment = (
+            settings.CONFIDENT_TRACE_ENVIRONMENT
+            if settings.CONFIDENT_TRACE_ENVIRONMENT is not None
+            else Environment.DEVELOPMENT.value
         )
         validate_environment(self.environment)

-        self.sampling_rate =
+        self.sampling_rate = settings.CONFIDENT_SAMPLE_RATE
         validate_sampling_rate(self.sampling_rate)
         self.openai_client = None
         self.tracing_enabled = True
@@ -103,7 +109,8 @@ class TraceManager:
         queue_size = self._trace_queue.qsize()
         in_flight = len(self._in_flight_tasks)
         remaining_tasks = queue_size + in_flight
-
+
+        if not self._flush_enabled and remaining_tasks > 0:
             self._print_trace_status(
                 message=f"WARNING: Exiting with {queue_size + in_flight} abaonded trace(s).",
                 trace_worker_status=TraceWorkerStatus.WARNING,
@@ -179,8 +186,14 @@ class TraceManager:
         if trace.status == TraceSpanStatus.IN_PROGRESS:
             trace.status = TraceSpanStatus.SUCCESS

+        mode = get_deepeval_trace_mode()
+        if mode == "gen":
+            body = self.create_trace_api(trace).model_dump(
+                by_alias=True, exclude_none=True
+            )
+            dump_body_to_json_file(body)
         # Post the trace to the server before removing it
-
+        elif not self.evaluating:
             self.post_trace(trace)
         else:
             if self.evaluation_loop:
@@ -237,6 +250,15 @@ class TraceManager:
            # This is a child span, find its parent and add it to the parent's children
            parent_span = self.get_span_by_uuid(span.parent_uuid)
            if parent_span:
+
+                if (
+                    parent_span.name == EVAL_DUMMY_SPAN_NAME
+                ):  # ignored span for evaluation
+                    span.parent_uuid = None
+                    trace.root_spans.remove(parent_span)
+                    trace.root_spans.append(span)
+                    return
+
                parent_span.children.append(span)
            else:
                trace.root_spans.append(span)
@@ -274,10 +296,7 @@ class TraceManager:
         description: Optional[str] = None,
         environment: Optional[str] = None,
     ):
-        if (
-            os.getenv(CONFIDENT_TRACE_VERBOSE, "YES").upper() != "NO"
-            and self.evaluating is False
-        ):
+        if get_settings().CONFIDENT_TRACE_VERBOSE and self.evaluating is False:
             console = Console()
             message_prefix = "[dim][Confident AI Trace Log][/dim]"
             if trace_worker_status == TraceWorkerStatus.SUCCESS:
@@ -401,6 +420,7 @@ class TraceManager:
                api = Api(api_key=trace_api.confident_api_key)
            else:
                api = Api(api_key=self.confident_api_key)
+
            api_response, link = await api.a_send_request(
                method=HttpMethods.POST,
                endpoint=Endpoints.TRACES_ENDPOINT,
@@ -415,7 +435,7 @@ class TraceManager:
                    description=link,
                    environment=self.environment,
                )
-            elif
+            elif self._flush_enabled:
                # Main thread gone → to be flushed
                remaining_trace_request_bodies.append(body)

@@ -492,6 +512,7 @@ class TraceManager:
        with capture_send_trace():
            try:
                api = Api(api_key=self.confident_api_key)
+
                _, link = api.send_request(
                    method=HttpMethods.POST,
                    endpoint=Endpoints.TRACES_ENDPOINT,
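A note for readers following the TraceManager changes: configuration now flows through the new pydantic-settings object instead of ad-hoc os.getenv calls. The sketch below is a minimal illustration of that access pattern, assuming only what the hunks above show (the field names and the edit(persist=False) context manager); it is not part of the package.

```python
# Hedged sketch of the settings access pattern used throughout this release.
# Field names come from the hunks above; edit(persist=False) mirrors the calls
# in deepeval/utils.py and is assumed to apply the change in-process only.
from deepeval.config.settings import get_settings

settings = get_settings()
print(settings.CONFIDENT_TRACE_VERBOSE)      # replaces os.getenv(CONFIDENT_TRACE_VERBOSE, "YES")
print(settings.CONFIDENT_SAMPLE_RATE)        # read by TraceManager.__init__ for sampling
print(settings.CONFIDENT_TRACE_ENVIRONMENT)  # falls back to "development" when None

with settings.edit(persist=False):           # in-memory override, not written to any .env file
    settings.CONFIDENT_TRACE_VERBOSE = False
```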
deepeval/tracing/utils.py
CHANGED
@@ -1,8 +1,15 @@
 import os
+import time
+import inspect
+import json
+import sys
+import difflib
 from datetime import datetime, timezone
 from enum import Enum
 from time import perf_counter
+import time
 from collections import deque
+from typing import Any, Dict, Optional, Sequence, Callable

 from deepeval.constants import CONFIDENT_TRACING_ENABLED

@@ -14,6 +21,12 @@ class Environment(Enum):
     TESTING = "testing"


+def _strip_nul(s: str) -> str:
+    # Replace embedded NUL, which Postgres cannot store in text/jsonb
+    # Do NOT try to escape as \u0000 because PG will still reject it.
+    return s.replace("\x00", "")
+
+
 def tracing_enabled():
     return os.getenv(CONFIDENT_TRACING_ENABLED, "YES").upper() == "YES"

@@ -42,6 +55,11 @@ def make_json_serializable(obj):

     def _serialize(o):
         oid = id(o)
+
+        # strip Nulls
+        if isinstance(o, str):
+            return _strip_nul(o)
+
         # Primitive types are already serializable
         if isinstance(o, (str, int, float, bool)) or o is None:
             return o
@@ -77,7 +95,7 @@ def make_json_serializable(obj):
             return result

         # Fallback: convert to string
-        return str(o)
+        return _strip_nul(str(o))

     return _serialize(obj)

@@ -115,3 +133,82 @@ def replace_self_with_class_name(obj):
         return f"<{obj.__class__.__name__}>"
     except:
         return f"<self>"
+
+
+def get_deepeval_trace_mode() -> Optional[str]:
+    deepeval_trace_mode = None
+    try:
+        args = sys.argv
+        for idx, arg in enumerate(args):
+            if isinstance(arg, str) and arg.startswith(
+                "--deepeval-trace-mode="
+            ):
+                deepeval_trace_mode = (
+                    arg.split("=", 1)[1].strip().strip('"').strip("'").lower()
+                )
+                break
+            if arg == "--deepeval-trace-mode" and idx + 1 < len(args):
+                deepeval_trace_mode = (
+                    str(args[idx + 1]).strip().strip('"').strip("'").lower()
+                )
+                break
+    except Exception:
+        deepeval_trace_mode = None
+
+    return deepeval_trace_mode
+
+
+def dump_body_to_json_file(
+    body: Dict[str, Any], file_path: Optional[str] = None
+) -> str:
+    entry_file = None
+    try:
+        cmd0 = sys.argv[0] if sys.argv else None
+        if cmd0 and cmd0.endswith(".py"):
+            entry_file = cmd0
+        else:
+            for frame_info in reversed(inspect.stack()):
+                fp = frame_info.filename
+                if (
+                    fp
+                    and fp.endswith(".py")
+                    and "deepeval/tracing" not in fp
+                    and "site-packages" not in fp
+                ):
+                    entry_file = fp
+                    break
+    except Exception:
+        entry_file = None
+
+    if not entry_file:
+        entry_file = "unknown.py"
+
+    abs_entry = os.path.abspath(entry_file)
+    dir_path = os.path.dirname(abs_entry)
+
+    file_arg = None
+    try:
+        for idx, arg in enumerate(sys.argv):
+            if isinstance(arg, str) and arg.startswith(
+                "--deepeval-trace-file-name="
+            ):
+                file_arg = arg.split("=", 1)[1].strip().strip('"').strip("'")
+                break
+            if arg == "--deepeval-trace-file-name" and idx + 1 < len(sys.argv):
+                file_arg = str(sys.argv[idx + 1]).strip().strip('"').strip("'")
+                break
+    except Exception:
+        file_arg = None
+
+    if file_path:
+        dst_path = os.path.abspath(file_path)
+    elif file_arg:
+        dst_path = os.path.abspath(file_arg)
+    else:
+        base_name = os.path.splitext(os.path.basename(abs_entry))[0]
+        dst_path = os.path.join(dir_path, f"{base_name}.json")
+
+    actual_body = make_json_serializable(body)
+    with open(dst_path, "w", encoding="utf-8") as f:
+        json.dump(actual_body, f, ensure_ascii=False, indent=2, sort_keys=True)
+    return dst_path
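The two helpers added at the bottom of this file back the new --deepeval-trace-mode and --deepeval-trace-file-name command-line flags: the mode is read straight from sys.argv, and the file-name flag (or the file_path argument) decides where dump_body_to_json_file writes the serialized trace body. A small sketch of exercising them directly is below; the payload dict is invented for illustration.

```python
# Hedged sketch: driving the new helpers with a toy payload.
import sys
from deepeval.tracing.utils import get_deepeval_trace_mode, dump_body_to_json_file

# Simulate: python my_eval.py --deepeval-trace-mode=gen --deepeval-trace-file-name=trace.json
sys.argv += ["--deepeval-trace-mode=gen", "--deepeval-trace-file-name=trace.json"]

assert get_deepeval_trace_mode() == "gen"

# Values that are not JSON-serializable fall back to str() via make_json_serializable.
path = dump_body_to_json_file({"name": "my-trace", "baseSpans": []})
print(path)  # absolute path to trace.json, taken from the --deepeval-trace-file-name flag
```

In "gen" mode, TraceManager.end_trace in tracing.py calls these same helpers to write the trace body to disk instead of posting it.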
deepeval/utils.py
CHANGED
@@ -1,26 +1,34 @@
-from contextvars import ContextVar
-from enum import Enum
 import copy
 import os
 import json
 import time
-from typing import Any, Optional, Dict, List, Union
-from collections.abc import Iterable
 import webbrowser
 import tqdm
 import re
 import string
-from dataclasses import asdict, is_dataclass
-import re
 import asyncio
 import nest_asyncio
 import uuid
+import math
+
+from contextvars import ContextVar
+from enum import Enum
+from typing import Any, Optional, Dict, List, Union
+from collections.abc import Iterable
+from dataclasses import asdict, is_dataclass
 from pydantic import BaseModel
 from rich.progress import Progress
 from rich.console import Console, Theme

 from deepeval.confident.api import set_confident_api_key
 from deepeval.constants import CONFIDENT_OPEN_BROWSER
+from deepeval.config.settings import get_settings
+from deepeval.config.utils import (
+    parse_bool,
+    get_env_bool,
+    bool_to_env_str,
+    set_env_bool,
+)


 def get_lcs(seq1, seq2):
@@ -140,82 +148,55 @@ def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
     return loop


-def
-
-
-
-    else:
-        return False
-    except:
-        return False
+def set_should_skip_on_missing_params(yes: bool):
+    s = get_settings()
+    with s.edit(persist=False):
+        s.SKIP_DEEPEVAL_MISSING_PARAMS = yes


-def
-
-        os.environ["SKIP_DEEPEVAL_MISSING_PARAMS"] = "YES"
-    else:
-        os.environ["SKIP_DEEPEVAL_MISSING_PARAMS"] = "NO"
+def should_ignore_errors() -> bool:
+    return bool(get_settings().IGNORE_DEEPEVAL_ERRORS)


-def
-
-        if os.environ["IGNORE_DEEPEVAL_ERRORS"] == "YES":
-            return True
-        else:
-            return False
-    except:
-        return False
+def should_skip_on_missing_params() -> bool:
+    return bool(get_settings().SKIP_DEEPEVAL_MISSING_PARAMS)


 def set_should_ignore_errors(yes: bool):
-
-
-
-        os.environ["IGNORE_DEEPEVAL_ERRORS"] = "NO"
+    s = get_settings()
+    with s.edit(persist=False):
+        s.IGNORE_DEEPEVAL_ERRORS = yes


-def should_verbose_print() ->
-
-        if os.environ["DEEPEVAL_VERBOSE_MODE"] == "YES":
-            return True
-        else:
-            return None
-    except:
-        return None
+def should_verbose_print() -> bool:
+    return bool(get_settings().DEEPEVAL_VERBOSE_MODE)


 def set_verbose_mode(yes: Optional[bool]):
-
-
+    s = get_settings()
+    with s.edit(persist=False):
+        s.DEEPEVAL_VERBOSE_MODE = yes


 def set_identifier(identifier: Optional[str]):
     if identifier:
-
+        s = get_settings()
+        with s.edit(persist=False):
+            s.DEEPEVAL_IDENTIFIER = identifier


 def get_identifier() -> Optional[str]:
-
-        return os.environ["DEEPEVAL_IDENTIFIER"]
-    except:
-        return None
+    return get_settings().DEEPEVAL_IDENTIFIER


-def should_use_cache():
-
-        if os.environ["ENABLE_DEEPEVAL_CACHE"] == "YES":
-            return True
-        else:
-            return False
-    except:
-        return False
+def should_use_cache() -> bool:
+    return bool(get_settings().ENABLE_DEEPEVAL_CACHE)


 def set_should_use_cache(yes: bool):
-
-
-
-        os.environ["ENABLE_DEEPEVAL_CACHE"] = "NO"
+    s = get_settings()
+    with s.edit(persist=False):
+        s.ENABLE_DEEPEVAL_CACHE = yes


 def login(api_key: str):
@@ -233,17 +214,11 @@ def login(api_key: str):


 def set_is_running_deepeval(flag: bool):
-
-        os.environ["DEEPEVAL"] = "YES"
-    else:
-        os.environ["DEEPEVAL"] = "NO"
+    set_env_bool("DEEPEVAL", flag)


 def get_is_running_deepeval() -> bool:
-
-        return os.environ["DEEPEVAL"] == "YES"
-    except:
-        return False
+    return get_env_bool("DEEPEVAL")


 def is_in_ci_env() -> bool:
@@ -270,8 +245,8 @@ def is_in_ci_env() -> bool:


 def open_browser(url: str):
-    if
-        if is_in_ci_env()
+    if get_settings().CONFIDENT_OPEN_BROWSER:
+        if not is_in_ci_env():
             webbrowser.open(url)


@@ -439,6 +414,8 @@ def normalize_text(text: str) -> str:


 def get_freer_gpu():
+    import numpy as np
+
     os.system("nvidia-smi -q -d Memory |grep -A4 GPU|grep Free >tmp_smi")
     memory_available = [
         int(x.split()[2]) + 5 * i
@@ -466,8 +443,11 @@ def wait_free_gpu(gb_needed):
 def select_freer_gpu():
     freer_gpu = str(get_freer_gpu())
     print("Will use GPU: %s" % (freer_gpu))
-
-
+
+    s = get_settings()
+    with s.edit(persist=False):
+        s.CUDA_LAUNCH_BLOCKING = True
+        s.CUDA_VISIBLE_DEVICES = freer_gpu
     return freer_gpu


@@ -535,6 +515,67 @@ def remove_pbars(
     progress.remove_task(pbar_id)


+def read_env_int(
+    name: str, default: int, *, min_value: int | None = None
+) -> int:
+    """Read an integer from an environment variable with safe fallback.
+
+    Attempts to read os.environ[name] and parse it as an int. If the variable
+    is unset, cannot be parsed, or is less than `min_value` (when provided),
+    the function returns `default`.
+
+    Args:
+        name: Environment variable name to read.
+        default: Value to return when the env var is missing/invalid/out of range.
+        min_value: Optional inclusive lower bound; values < min_value are rejected.
+
+    Returns:
+        The parsed integer, or `default` on any failure.
+    """
+    raw = os.getenv(name)
+    if raw is None:
+        return default
+    try:
+        v = int(raw)
+        if min_value is not None and v < min_value:
+            return default
+        return v
+    except Exception:
+        return default
+
+
+def read_env_float(
+    name: str, default: float, *, min_value: float | None = None
+) -> float:
+    """Read a float from an environment variable with safe fallback.
+
+    Attempts to read os.environ[name] and parse it as a float. If the variable
+    is unset, cannot be parsed, or is less than `min_value` (when provided),
+    the function returns `default`.
+
+    Args:
+        name: Environment variable name to read.
+        default: Value to return when the env var is missing/invalid/out of range.
+        min_value: Optional inclusive lower bound; values < min_value are rejected.
+
+    Returns:
+        The parsed float, or `default` on any failure.
+    """
+    raw = os.getenv(name)
+    if raw is None:
+        return default
+    try:
+        v = float(raw)
+    except Exception:
+        return default
+
+    if not math.isfinite(v):
+        return default
+    if min_value is not None and v < min_value:
+        return default
+    return v
+
+
 my_theme = Theme(
     {
         "bar.complete": "#11ff00",
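read_env_int and read_env_float give the rest of the codebase a safe way to pull numeric tuning values from the environment: anything missing, unparsable, non-finite, or below min_value falls back to the default. A short usage sketch (the env-var names are made up for illustration):

```python
# Hedged sketch of the new env readers; the env-var names are illustrative only.
import os
from deepeval.utils import read_env_int, read_env_float

os.environ["MY_RETRY_LIMIT"] = "5"
os.environ["MY_SAMPLE_RATE"] = "not-a-number"

print(read_env_int("MY_RETRY_LIMIT", default=3, min_value=1))      # 5
print(read_env_int("MY_MISSING_VAR", default=3, min_value=1))      # 3 (unset -> default)
print(read_env_float("MY_SAMPLE_RATE", default=1.0, min_value=0))  # 1.0 (unparsable -> default)
```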
{deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepeval
-Version: 3.4.7
+Version: 3.4.9
 Summary: The LLM Evaluation Framework
 Home-page: https://github.com/confident-ai/deepeval
 License: Apache-2.0
@@ -25,6 +25,8 @@ Requires-Dist: opentelemetry-exporter-otlp-proto-grpc (>=1.24.0,<2.0.0)
 Requires-Dist: opentelemetry-sdk (>=1.24.0,<2.0.0)
 Requires-Dist: portalocker
 Requires-Dist: posthog (>=6.3.0,<7.0.0)
+Requires-Dist: pydantic (>=2.11.7,<3.0.0)
+Requires-Dist: pydantic-settings (>=2.10.1,<3.0.0)
 Requires-Dist: pyfiglet
 Requires-Dist: pytest
 Requires-Dist: pytest-asyncio