deepeval 3.6.9__py3-none-any.whl → 3.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +0 -4
- deepeval/_version.py +1 -1
- deepeval/anthropic/__init__.py +19 -0
- deepeval/anthropic/extractors.py +94 -0
- deepeval/anthropic/patch.py +169 -0
- deepeval/anthropic/utils.py +225 -0
- deepeval/benchmarks/drop/drop.py +40 -14
- deepeval/benchmarks/ifeval/ifeval.py +2 -2
- deepeval/cli/main.py +7 -0
- deepeval/confident/api.py +6 -1
- deepeval/confident/types.py +4 -2
- deepeval/config/settings.py +159 -11
- deepeval/config/settings_manager.py +4 -0
- deepeval/evaluate/compare.py +215 -4
- deepeval/evaluate/types.py +6 -0
- deepeval/evaluate/utils.py +30 -0
- deepeval/integrations/crewai/handler.py +36 -0
- deepeval/integrations/langchain/callback.py +27 -2
- deepeval/integrations/llama_index/handler.py +58 -4
- deepeval/integrations/llama_index/utils.py +24 -0
- deepeval/key_handler.py +1 -0
- deepeval/metrics/__init__.py +5 -0
- deepeval/metrics/arena_g_eval/arena_g_eval.py +5 -1
- deepeval/metrics/arena_g_eval/utils.py +5 -5
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +9 -18
- deepeval/metrics/exact_match/__init__.py +0 -0
- deepeval/metrics/exact_match/exact_match.py +94 -0
- deepeval/metrics/g_eval/g_eval.py +5 -1
- deepeval/metrics/g_eval/utils.py +1 -1
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +5 -1
- deepeval/metrics/pattern_match/__init__.py +0 -0
- deepeval/metrics/pattern_match/pattern_match.py +103 -0
- deepeval/metrics/task_completion/task_completion.py +9 -2
- deepeval/metrics/utils.py +1 -1
- deepeval/model_integrations/__init__.py +0 -0
- deepeval/model_integrations/utils.py +116 -0
- deepeval/models/base_model.py +3 -1
- deepeval/models/llms/gemini_model.py +27 -5
- deepeval/openai/__init__.py +3 -1
- deepeval/openai/extractors.py +2 -2
- deepeval/openai/utils.py +7 -31
- deepeval/openai_agents/callback_handler.py +12 -3
- deepeval/prompt/api.py +11 -10
- deepeval/prompt/prompt.py +27 -15
- deepeval/simulator/template.py +1 -1
- deepeval/telemetry.py +3 -3
- deepeval/test_case/__init__.py +2 -1
- deepeval/test_case/arena_test_case.py +15 -4
- deepeval/test_case/llm_test_case.py +3 -2
- deepeval/test_case/mllm_test_case.py +45 -22
- deepeval/test_run/api.py +3 -2
- deepeval/test_run/cache.py +35 -13
- deepeval/test_run/hyperparameters.py +5 -1
- deepeval/test_run/test_run.py +52 -14
- deepeval/tracing/api.py +11 -10
- deepeval/tracing/otel/exporter.py +11 -0
- deepeval/tracing/patchers.py +102 -1
- deepeval/tracing/trace_context.py +13 -4
- deepeval/tracing/tracing.py +11 -2
- deepeval/tracing/types.py +8 -8
- deepeval/tracing/utils.py +9 -0
- deepeval/utils.py +48 -2
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/METADATA +3 -3
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/RECORD +68 -58
- /deepeval/{openai → model_integrations}/types.py +0 -0
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/WHEEL +0 -0
- {deepeval-3.6.9.dist-info → deepeval-3.7.1.dist-info}/entry_points.txt +0 -0
|
@@ -11,33 +11,50 @@ from deepeval.test_case import ToolCall
|
|
|
11
11
|
|
|
12
12
|
@dataclass
|
|
13
13
|
class MLLMImage:
|
|
14
|
-
|
|
14
|
+
dataBase64: Optional[str] = None
|
|
15
|
+
mimeType: Optional[str] = None
|
|
16
|
+
url: Optional[str] = None
|
|
15
17
|
local: Optional[bool] = None
|
|
16
|
-
filename: Optional[str] =
|
|
17
|
-
mimeType: Optional[str] = field(default=None, init=False, repr=False)
|
|
18
|
-
dataBase64: Optional[str] = field(default=None, init=False, repr=False)
|
|
18
|
+
filename: Optional[str] = None
|
|
19
19
|
|
|
20
20
|
def __post_init__(self):
|
|
21
|
-
|
|
22
|
-
if self.
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
self.filename = os.path.basename(path)
|
|
31
|
-
self.mimeType = (
|
|
32
|
-
mimetypes.guess_type(path)[0] or "application/octet-stream"
|
|
21
|
+
|
|
22
|
+
if self.url and self.dataBase64:
|
|
23
|
+
raise ValueError(
|
|
24
|
+
"You cannot provide both 'url' and 'dataBase64' at the same time when creating an MLLMImage."
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
if not self.url and not self.dataBase64:
|
|
28
|
+
raise ValueError(
|
|
29
|
+
"You must provide either a 'url' or both 'dataBase64' and 'mimeType' to create an MLLMImage."
|
|
33
30
|
)
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
self.
|
|
31
|
+
|
|
32
|
+
if self.dataBase64 is not None:
|
|
33
|
+
if self.mimeType is None:
|
|
34
|
+
raise ValueError(
|
|
35
|
+
"mimeType must be provided when initializing from Base64 data."
|
|
36
|
+
)
|
|
37
37
|
else:
|
|
38
|
-
|
|
39
|
-
self.
|
|
40
|
-
|
|
38
|
+
is_local = self.is_local_path(self.url)
|
|
39
|
+
if self.local is not None:
|
|
40
|
+
assert self.local == is_local, "Local path mismatch"
|
|
41
|
+
else:
|
|
42
|
+
self.local = is_local
|
|
43
|
+
|
|
44
|
+
# compute filename, mime_type, and Base64 data
|
|
45
|
+
if self.local:
|
|
46
|
+
path = self.process_url(self.url)
|
|
47
|
+
self.filename = os.path.basename(path)
|
|
48
|
+
self.mimeType = (
|
|
49
|
+
mimetypes.guess_type(path)[0] or "application/octet-stream"
|
|
50
|
+
)
|
|
51
|
+
with open(path, "rb") as f:
|
|
52
|
+
raw = f.read()
|
|
53
|
+
self.dataBase64 = base64.b64encode(raw).decode("ascii")
|
|
54
|
+
else:
|
|
55
|
+
self.filename = None
|
|
56
|
+
self.mimeType = None
|
|
57
|
+
self.dataBase64 = None
|
|
41
58
|
|
|
42
59
|
@staticmethod
|
|
43
60
|
def process_url(url: str) -> str:
|
|
@@ -69,6 +86,12 @@ class MLLMImage:
|
|
|
69
86
|
return os.path.exists(path)
|
|
70
87
|
return False
|
|
71
88
|
|
|
89
|
+
def as_data_uri(self) -> Optional[str]:
|
|
90
|
+
"""Return the image as a data URI string, if Base64 data is available."""
|
|
91
|
+
if not self.dataBase64 or not self.mimeType:
|
|
92
|
+
return None
|
|
93
|
+
return f"data:{self.mimeType};base64,{self.dataBase64}"
|
|
94
|
+
|
|
72
95
|
|
|
73
96
|
class MLLMTestCaseParams(Enum):
|
|
74
97
|
INPUT = "input"
|
deepeval/test_run/api.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
from pydantic import BaseModel, Field
|
|
1
|
+
from pydantic import BaseModel, Field
|
|
2
2
|
from typing import Optional, List, Union, Dict
|
|
3
3
|
|
|
4
4
|
from deepeval.test_case import MLLMImage, ToolCall
|
|
5
5
|
from deepeval.tracing.api import TraceApi, MetricData
|
|
6
|
+
from deepeval.utils import make_model_config
|
|
6
7
|
|
|
7
8
|
|
|
8
9
|
class LLMApiTestCase(BaseModel):
|
|
@@ -49,7 +50,7 @@ class LLMApiTestCase(BaseModel):
|
|
|
49
50
|
comments: Optional[str] = Field(None)
|
|
50
51
|
trace: Optional[TraceApi] = Field(None)
|
|
51
52
|
|
|
52
|
-
model_config =
|
|
53
|
+
model_config = make_model_config(arbitrary_types_allowed=True)
|
|
53
54
|
# metric_collection: Optional[str] = Field(None, alias="metricCollection")
|
|
54
55
|
|
|
55
56
|
def update_metric_data(self, metric_data: MetricData):
|
deepeval/test_run/cache.py
CHANGED
|
@@ -1,25 +1,44 @@
|
|
|
1
|
-
import
|
|
1
|
+
import logging
|
|
2
2
|
import sys
|
|
3
3
|
import json
|
|
4
4
|
import os
|
|
5
|
-
from typing import List, Optional,
|
|
5
|
+
from typing import List, Optional, Dict, Union
|
|
6
6
|
from enum import Enum
|
|
7
7
|
from pydantic import BaseModel, Field
|
|
8
8
|
|
|
9
|
+
from deepeval.utils import make_model_config
|
|
10
|
+
|
|
9
11
|
from deepeval.test_case import LLMTestCaseParams, LLMTestCase, ToolCallParams
|
|
10
12
|
from deepeval.test_run.api import MetricData
|
|
11
13
|
from deepeval.utils import (
|
|
12
14
|
delete_file_if_exists,
|
|
15
|
+
is_read_only_env,
|
|
13
16
|
serialize,
|
|
14
17
|
)
|
|
15
18
|
from deepeval.metrics import BaseMetric
|
|
16
19
|
from deepeval.constants import HIDDEN_DIR
|
|
17
20
|
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
portalocker = None
|
|
26
|
+
if not is_read_only_env():
|
|
27
|
+
try:
|
|
28
|
+
import portalocker
|
|
29
|
+
except Exception as e:
|
|
30
|
+
logger.warning("failed to import portalocker: %s", e)
|
|
31
|
+
else:
|
|
32
|
+
logger.warning("READ_ONLY filesystem: skipping disk cache for test runs.")
|
|
33
|
+
|
|
34
|
+
|
|
18
35
|
CACHE_FILE_NAME = f"{HIDDEN_DIR}/.deepeval-cache.json"
|
|
19
36
|
TEMP_CACHE_FILE_NAME = f"{HIDDEN_DIR}/.temp-deepeval-cache.json"
|
|
20
37
|
|
|
21
38
|
|
|
22
39
|
class MetricConfiguration(BaseModel):
|
|
40
|
+
model_config = make_model_config(arbitrary_types_allowed=True)
|
|
41
|
+
|
|
23
42
|
##### Required fields #####
|
|
24
43
|
threshold: float
|
|
25
44
|
evaluation_model: Optional[str] = None
|
|
@@ -36,9 +55,6 @@ class MetricConfiguration(BaseModel):
|
|
|
36
55
|
Union[List[LLMTestCaseParams], List[ToolCallParams]]
|
|
37
56
|
] = None
|
|
38
57
|
|
|
39
|
-
class Config:
|
|
40
|
-
arbitrary_types_allowed = True
|
|
41
|
-
|
|
42
58
|
|
|
43
59
|
class CachedMetricData(BaseModel):
|
|
44
60
|
metric_data: MetricData
|
|
@@ -96,7 +112,7 @@ class TestRunCacheManager:
|
|
|
96
112
|
def get_cached_test_case(
|
|
97
113
|
self, test_case: LLMTestCase, hyperparameters: Union[Dict, None]
|
|
98
114
|
) -> Union[CachedTestCase, None]:
|
|
99
|
-
if self.disable_write_cache:
|
|
115
|
+
if self.disable_write_cache or portalocker is None:
|
|
100
116
|
return None
|
|
101
117
|
|
|
102
118
|
cached_test_run = self.get_cached_test_run()
|
|
@@ -121,7 +137,7 @@ class TestRunCacheManager:
|
|
|
121
137
|
hyperparameters: Union[Dict, None],
|
|
122
138
|
to_temp: bool = False,
|
|
123
139
|
):
|
|
124
|
-
if self.disable_write_cache:
|
|
140
|
+
if self.disable_write_cache or portalocker is None:
|
|
125
141
|
return
|
|
126
142
|
cache_dict = {
|
|
127
143
|
LLMTestCaseParams.INPUT.value: test_case.input,
|
|
@@ -141,7 +157,7 @@ class TestRunCacheManager:
|
|
|
141
157
|
def set_cached_test_run(
|
|
142
158
|
self, cached_test_run: CachedTestRun, temp: bool = False
|
|
143
159
|
):
|
|
144
|
-
if self.disable_write_cache:
|
|
160
|
+
if self.disable_write_cache or portalocker is None:
|
|
145
161
|
return
|
|
146
162
|
|
|
147
163
|
if temp:
|
|
@@ -150,7 +166,7 @@ class TestRunCacheManager:
|
|
|
150
166
|
self.cached_test_run = cached_test_run
|
|
151
167
|
|
|
152
168
|
def save_cached_test_run(self, to_temp: bool = False):
|
|
153
|
-
if self.disable_write_cache:
|
|
169
|
+
if self.disable_write_cache or portalocker is None:
|
|
154
170
|
return
|
|
155
171
|
|
|
156
172
|
if to_temp:
|
|
@@ -177,7 +193,7 @@ class TestRunCacheManager:
|
|
|
177
193
|
)
|
|
178
194
|
|
|
179
195
|
def create_cached_test_run(self, temp: bool = False):
|
|
180
|
-
if self.disable_write_cache:
|
|
196
|
+
if self.disable_write_cache or portalocker is None:
|
|
181
197
|
return
|
|
182
198
|
|
|
183
199
|
cached_test_run = CachedTestRun()
|
|
@@ -187,7 +203,7 @@ class TestRunCacheManager:
|
|
|
187
203
|
def get_cached_test_run(
|
|
188
204
|
self, from_temp: bool = False
|
|
189
205
|
) -> Union[CachedTestRun, None]:
|
|
190
|
-
if self.disable_write_cache:
|
|
206
|
+
if self.disable_write_cache or portalocker is None:
|
|
191
207
|
return
|
|
192
208
|
|
|
193
209
|
should_create_cached_test_run = False
|
|
@@ -208,7 +224,7 @@ class TestRunCacheManager:
|
|
|
208
224
|
try:
|
|
209
225
|
data = json.loads(content)
|
|
210
226
|
self.temp_cached_test_run = CachedTestRun.load(data)
|
|
211
|
-
except Exception
|
|
227
|
+
except Exception:
|
|
212
228
|
should_create_cached_test_run = True
|
|
213
229
|
except portalocker.exceptions.LockException as e:
|
|
214
230
|
print(
|
|
@@ -216,6 +232,9 @@ class TestRunCacheManager:
|
|
|
216
232
|
file=sys.stderr,
|
|
217
233
|
)
|
|
218
234
|
|
|
235
|
+
if should_create_cached_test_run:
|
|
236
|
+
self.create_cached_test_run(temp=from_temp)
|
|
237
|
+
|
|
219
238
|
return self.temp_cached_test_run
|
|
220
239
|
else:
|
|
221
240
|
if self.cached_test_run:
|
|
@@ -249,6 +268,9 @@ class TestRunCacheManager:
|
|
|
249
268
|
return self.cached_test_run
|
|
250
269
|
|
|
251
270
|
def wrap_up_cached_test_run(self):
|
|
271
|
+
if portalocker is None:
|
|
272
|
+
return
|
|
273
|
+
|
|
252
274
|
if self.disable_write_cache:
|
|
253
275
|
# Clear cache if write cache is disabled
|
|
254
276
|
delete_file_if_exists(self.cache_file_name)
|
|
@@ -329,7 +351,7 @@ class Cache:
|
|
|
329
351
|
if criteria_value != cached_criteria_value:
|
|
330
352
|
return False
|
|
331
353
|
continue
|
|
332
|
-
except:
|
|
354
|
+
except Exception:
|
|
333
355
|
# For non-GEval
|
|
334
356
|
continue
|
|
335
357
|
|
|
@@ -33,7 +33,11 @@ def process_hyperparameters(
|
|
|
33
33
|
)
|
|
34
34
|
|
|
35
35
|
if isinstance(value, Prompt):
|
|
36
|
-
|
|
36
|
+
try:
|
|
37
|
+
prompt_key = f"{value.alias}_{value.version}"
|
|
38
|
+
except AttributeError:
|
|
39
|
+
prompt_key = f"{value.alias}_00.00.01"
|
|
40
|
+
|
|
37
41
|
if value._prompt_version_id is not None and value.type is not None:
|
|
38
42
|
processed_hyperparameters[key] = PromptApi(
|
|
39
43
|
id=value._prompt_version_id,
|
deepeval/test_run/test_run.py
CHANGED
|
@@ -6,11 +6,11 @@ from typing import Any, Optional, List, Dict, Union, Tuple
|
|
|
6
6
|
import shutil
|
|
7
7
|
import sys
|
|
8
8
|
import datetime
|
|
9
|
-
import portalocker
|
|
10
9
|
from rich.table import Table
|
|
11
10
|
from rich.console import Console
|
|
12
11
|
from rich import print
|
|
13
12
|
|
|
13
|
+
|
|
14
14
|
from deepeval.metrics import BaseMetric
|
|
15
15
|
from deepeval.confident.api import Api, Endpoints, HttpMethods, is_confident
|
|
16
16
|
from deepeval.test_run.api import (
|
|
@@ -25,6 +25,7 @@ from deepeval.test_case import LLMTestCase, ConversationalTestCase, MLLMTestCase
|
|
|
25
25
|
from deepeval.utils import (
|
|
26
26
|
delete_file_if_exists,
|
|
27
27
|
get_is_running_deepeval,
|
|
28
|
+
is_read_only_env,
|
|
28
29
|
open_browser,
|
|
29
30
|
shorten,
|
|
30
31
|
format_turn,
|
|
@@ -42,6 +43,21 @@ from rich.panel import Panel
|
|
|
42
43
|
from rich.columns import Columns
|
|
43
44
|
|
|
44
45
|
|
|
46
|
+
portalocker = None
|
|
47
|
+
if not is_read_only_env():
|
|
48
|
+
try:
|
|
49
|
+
import portalocker
|
|
50
|
+
except Exception as e:
|
|
51
|
+
print(
|
|
52
|
+
f"Warning: failed to import portalocker: {e}",
|
|
53
|
+
file=sys.stderr,
|
|
54
|
+
)
|
|
55
|
+
else:
|
|
56
|
+
print(
|
|
57
|
+
"Warning: DeepEval is configured for read only environment. Test runs will not be written to disk."
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
|
|
45
61
|
TEMP_FILE_PATH = f"{HIDDEN_DIR}/.temp_test_run_data.json"
|
|
46
62
|
LATEST_TEST_RUN_FILE_PATH = f"{HIDDEN_DIR}/.latest_test_run.json"
|
|
47
63
|
LATEST_TEST_RUN_DATA_KEY = "testRunData"
|
|
@@ -456,26 +472,36 @@ class TestRunManager:
|
|
|
456
472
|
if self.test_run is None:
|
|
457
473
|
self.create_test_run(identifier=identifier)
|
|
458
474
|
|
|
459
|
-
if self.save_to_disk:
|
|
475
|
+
if portalocker and self.save_to_disk:
|
|
460
476
|
try:
|
|
461
477
|
with portalocker.Lock(
|
|
462
478
|
self.temp_file_path,
|
|
463
479
|
mode="r",
|
|
464
480
|
flags=portalocker.LOCK_SH | portalocker.LOCK_NB,
|
|
465
481
|
) as file:
|
|
466
|
-
|
|
482
|
+
loaded = self.test_run.load(file)
|
|
483
|
+
# only overwrite if loading actually worked
|
|
484
|
+
self.test_run = loaded
|
|
467
485
|
except (
|
|
468
486
|
FileNotFoundError,
|
|
487
|
+
json.JSONDecodeError,
|
|
469
488
|
portalocker.exceptions.LockException,
|
|
470
489
|
) as e:
|
|
471
|
-
print(
|
|
472
|
-
|
|
490
|
+
print(
|
|
491
|
+
f"Warning: Could not load test run from disk: {e}",
|
|
492
|
+
file=sys.stderr,
|
|
493
|
+
)
|
|
473
494
|
|
|
474
495
|
return self.test_run
|
|
475
496
|
|
|
476
497
|
def save_test_run(self, path: str, save_under_key: Optional[str] = None):
|
|
477
|
-
if self.save_to_disk:
|
|
498
|
+
if portalocker and self.save_to_disk:
|
|
478
499
|
try:
|
|
500
|
+
# ensure parent directory exists
|
|
501
|
+
parent = os.path.dirname(path)
|
|
502
|
+
if parent:
|
|
503
|
+
os.makedirs(parent, exist_ok=True)
|
|
504
|
+
|
|
479
505
|
with portalocker.Lock(path, mode="w") as file:
|
|
480
506
|
if save_under_key:
|
|
481
507
|
try:
|
|
@@ -495,11 +521,14 @@ class TestRunManager:
|
|
|
495
521
|
pass
|
|
496
522
|
|
|
497
523
|
def save_final_test_run_link(self, link: str):
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
524
|
+
if portalocker:
|
|
525
|
+
try:
|
|
526
|
+
with portalocker.Lock(
|
|
527
|
+
LATEST_TEST_RUN_FILE_PATH, mode="w"
|
|
528
|
+
) as file:
|
|
529
|
+
json.dump({LATEST_TEST_RUN_LINK_KEY: link}, file)
|
|
530
|
+
except portalocker.exceptions.LockException:
|
|
531
|
+
pass
|
|
503
532
|
|
|
504
533
|
def update_test_run(
|
|
505
534
|
self,
|
|
@@ -513,7 +542,7 @@ class TestRunManager:
|
|
|
513
542
|
):
|
|
514
543
|
return
|
|
515
544
|
|
|
516
|
-
if self.save_to_disk:
|
|
545
|
+
if portalocker and self.save_to_disk:
|
|
517
546
|
try:
|
|
518
547
|
with portalocker.Lock(
|
|
519
548
|
self.temp_file_path,
|
|
@@ -533,10 +562,19 @@ class TestRunManager:
|
|
|
533
562
|
self.test_run.save(file)
|
|
534
563
|
except (
|
|
535
564
|
FileNotFoundError,
|
|
565
|
+
json.JSONDecodeError,
|
|
536
566
|
portalocker.exceptions.LockException,
|
|
537
567
|
) as e:
|
|
538
|
-
print(
|
|
539
|
-
|
|
568
|
+
print(
|
|
569
|
+
f"Warning: Could not update test run on disk: {e}",
|
|
570
|
+
file=sys.stderr,
|
|
571
|
+
)
|
|
572
|
+
if self.test_run is None:
|
|
573
|
+
# guarantee a valid in-memory run so the update can proceed.
|
|
574
|
+
# never destroy in-memory state on I/O failure.
|
|
575
|
+
self.create_test_run()
|
|
576
|
+
self.test_run.add_test_case(api_test_case)
|
|
577
|
+
self.test_run.set_dataset_properties(test_case)
|
|
540
578
|
else:
|
|
541
579
|
if self.test_run is None:
|
|
542
580
|
self.create_test_run()
|
deepeval/tracing/api.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
from enum import Enum
|
|
2
2
|
from typing import Dict, List, Optional, Union, Literal, Any
|
|
3
|
-
from pydantic import BaseModel,
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
4
|
|
|
5
5
|
from deepeval.test_case import ToolCall
|
|
6
|
+
from deepeval.utils import make_model_config
|
|
6
7
|
|
|
7
8
|
|
|
8
9
|
class SpanApiType(Enum):
|
|
@@ -27,7 +28,7 @@ class PromptApi(BaseModel):
|
|
|
27
28
|
|
|
28
29
|
|
|
29
30
|
class MetricData(BaseModel):
|
|
30
|
-
model_config =
|
|
31
|
+
model_config = make_model_config(extra="ignore")
|
|
31
32
|
|
|
32
33
|
name: str
|
|
33
34
|
threshold: float
|
|
@@ -42,6 +43,10 @@ class MetricData(BaseModel):
|
|
|
42
43
|
|
|
43
44
|
|
|
44
45
|
class BaseApiSpan(BaseModel):
|
|
46
|
+
model_config = make_model_config(
|
|
47
|
+
use_enum_values=True, validate_assignment=True
|
|
48
|
+
)
|
|
49
|
+
|
|
45
50
|
uuid: str
|
|
46
51
|
name: str = None
|
|
47
52
|
status: TraceSpanApiStatus
|
|
@@ -96,12 +101,12 @@ class BaseApiSpan(BaseModel):
|
|
|
96
101
|
metric_collection: Optional[str] = Field(None, alias="metricCollection")
|
|
97
102
|
metrics_data: Optional[List[MetricData]] = Field(None, alias="metricsData")
|
|
98
103
|
|
|
99
|
-
class Config:
|
|
100
|
-
use_enum_values = True
|
|
101
|
-
validate_assignment = True
|
|
102
|
-
|
|
103
104
|
|
|
104
105
|
class TraceApi(BaseModel):
|
|
106
|
+
model_config = make_model_config(
|
|
107
|
+
use_enum_values=True, validate_assignment=True
|
|
108
|
+
)
|
|
109
|
+
|
|
105
110
|
uuid: str
|
|
106
111
|
base_spans: Optional[List[BaseApiSpan]] = Field(None, alias="baseSpans")
|
|
107
112
|
agent_spans: Optional[List[BaseApiSpan]] = Field(None, alias="agentSpans")
|
|
@@ -139,7 +144,3 @@ class TraceApi(BaseModel):
|
|
|
139
144
|
|
|
140
145
|
# Don't serialize these
|
|
141
146
|
confident_api_key: Optional[str] = Field(None, exclude=True)
|
|
142
|
-
|
|
143
|
-
class Config:
|
|
144
|
-
use_enum_values = True
|
|
145
|
-
validate_assignment = True
|
|
@@ -493,6 +493,17 @@ class ConfidentSpanExporter(SpanExporter):
|
|
|
493
493
|
output_token_count = span.attributes.get(
|
|
494
494
|
"confident.llm.output_token_count"
|
|
495
495
|
)
|
|
496
|
+
|
|
497
|
+
# fallback to gen ai attributes if not found in confident attributes
|
|
498
|
+
if not input_token_count:
|
|
499
|
+
input_token_count = span.attributes.get(
|
|
500
|
+
"gen_ai.usage.input_tokens"
|
|
501
|
+
)
|
|
502
|
+
if not output_token_count:
|
|
503
|
+
output_token_count = span.attributes.get(
|
|
504
|
+
"gen_ai.usage.output_tokens"
|
|
505
|
+
)
|
|
506
|
+
|
|
496
507
|
cost_per_input_token = span.attributes.get(
|
|
497
508
|
"confident.llm.cost_per_input_token"
|
|
498
509
|
)
|
deepeval/tracing/patchers.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
|
-
from openai import OpenAI
|
|
2
1
|
import functools
|
|
3
2
|
|
|
3
|
+
from anthropic import Anthropic
|
|
4
|
+
from openai import OpenAI
|
|
5
|
+
|
|
4
6
|
from deepeval.tracing.context import update_current_span, update_llm_span
|
|
5
7
|
from deepeval.tracing.context import current_span_context
|
|
6
8
|
from deepeval.tracing.types import LlmSpan
|
|
@@ -82,3 +84,102 @@ def patch_openai_client(client: OpenAI):
|
|
|
82
84
|
return response
|
|
83
85
|
|
|
84
86
|
setattr(current_obj, method_name, wrapped_method)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def patch_anthropic_client(client: Anthropic):
|
|
90
|
+
"""
|
|
91
|
+
Patch an Anthropic client instance to add tracing capabilities.
|
|
92
|
+
|
|
93
|
+
Args:
|
|
94
|
+
client: An instance of Anthropic client to patch
|
|
95
|
+
"""
|
|
96
|
+
original_methods = {}
|
|
97
|
+
|
|
98
|
+
methods_to_patch = [
|
|
99
|
+
"messages.create",
|
|
100
|
+
]
|
|
101
|
+
|
|
102
|
+
for method_path in methods_to_patch:
|
|
103
|
+
parts = method_path.split(".")
|
|
104
|
+
current_obj = client
|
|
105
|
+
|
|
106
|
+
for part in parts[:-1]:
|
|
107
|
+
if not hasattr(current_obj, part):
|
|
108
|
+
print(f"Warning: Cannot find {part} in the path {method_path}")
|
|
109
|
+
continue
|
|
110
|
+
current_obj = getattr(current_obj, part)
|
|
111
|
+
|
|
112
|
+
method_name = parts[-1]
|
|
113
|
+
if not hasattr(current_obj, method_name):
|
|
114
|
+
print(
|
|
115
|
+
f"Warning: Cannot find method {method_name} in the path {method_path}"
|
|
116
|
+
)
|
|
117
|
+
continue
|
|
118
|
+
|
|
119
|
+
method = getattr(current_obj, method_name)
|
|
120
|
+
|
|
121
|
+
if callable(method) and not isinstance(method, type):
|
|
122
|
+
original_methods[method_path] = method
|
|
123
|
+
|
|
124
|
+
@functools.wraps(method)
|
|
125
|
+
def wrapped_method(*args, original_method=method, **kwargs):
|
|
126
|
+
current_span = current_span_context.get()
|
|
127
|
+
response = original_method(*args, **kwargs)
|
|
128
|
+
|
|
129
|
+
if isinstance(current_span, LlmSpan):
|
|
130
|
+
model = kwargs.get("model", None)
|
|
131
|
+
if model is None:
|
|
132
|
+
raise ValueError("model not found in client")
|
|
133
|
+
|
|
134
|
+
current_span.model = model
|
|
135
|
+
|
|
136
|
+
output = None
|
|
137
|
+
try:
|
|
138
|
+
if (
|
|
139
|
+
hasattr(response, "content")
|
|
140
|
+
and response.content
|
|
141
|
+
and len(response.content) > 0
|
|
142
|
+
):
|
|
143
|
+
for block in response.content:
|
|
144
|
+
if hasattr(block, "text"):
|
|
145
|
+
output = block.text
|
|
146
|
+
break
|
|
147
|
+
except Exception:
|
|
148
|
+
pass
|
|
149
|
+
|
|
150
|
+
input_token_count = None
|
|
151
|
+
output_token_count = None
|
|
152
|
+
try:
|
|
153
|
+
if hasattr(response, "usage"):
|
|
154
|
+
usage = response.usage
|
|
155
|
+
# usage can be a dict or an object with attributes
|
|
156
|
+
if isinstance(usage, dict):
|
|
157
|
+
input_token_count = usage.get(
|
|
158
|
+
"input_tokens", None
|
|
159
|
+
)
|
|
160
|
+
output_token_count = usage.get(
|
|
161
|
+
"output_tokens", None
|
|
162
|
+
)
|
|
163
|
+
else:
|
|
164
|
+
input_token_count = getattr(
|
|
165
|
+
usage, "input_tokens", None
|
|
166
|
+
)
|
|
167
|
+
output_token_count = getattr(
|
|
168
|
+
usage, "output_tokens", None
|
|
169
|
+
)
|
|
170
|
+
except Exception:
|
|
171
|
+
pass
|
|
172
|
+
|
|
173
|
+
update_current_span(
|
|
174
|
+
input=kwargs.get("messages", "INPUT_MESSAGE_NOT_FOUND"),
|
|
175
|
+
output=output if output else "OUTPUT_MESSAGE_NOT_FOUND",
|
|
176
|
+
)
|
|
177
|
+
update_llm_span(
|
|
178
|
+
input_token_count=input_token_count,
|
|
179
|
+
output_token_count=output_token_count,
|
|
180
|
+
)
|
|
181
|
+
return response
|
|
182
|
+
|
|
183
|
+
setattr(current_obj, method_name, wrapped_method)
|
|
184
|
+
|
|
185
|
+
return original_methods
|
|
@@ -1,13 +1,15 @@
|
|
|
1
|
-
from typing import Optional, List, Dict, Any
|
|
2
1
|
from contextvars import ContextVar
|
|
3
2
|
from contextlib import contextmanager
|
|
4
3
|
from dataclasses import dataclass
|
|
4
|
+
from typing import Optional, List, Dict, Any
|
|
5
5
|
|
|
6
|
-
from .tracing import trace_manager
|
|
7
|
-
from .context import current_trace_context, update_current_trace
|
|
8
|
-
from deepeval.prompt import Prompt
|
|
9
6
|
from deepeval.metrics import BaseMetric
|
|
7
|
+
from deepeval.prompt import Prompt
|
|
10
8
|
from deepeval.test_case.llm_test_case import ToolCall
|
|
9
|
+
from deepeval.tracing.context import current_trace_context, update_current_trace
|
|
10
|
+
from deepeval.tracing.tracing import trace_manager
|
|
11
|
+
from deepeval.tracing.types import TraceWorkerStatus
|
|
12
|
+
from deepeval.tracing.utils import is_async_context
|
|
11
13
|
|
|
12
14
|
|
|
13
15
|
@dataclass
|
|
@@ -59,6 +61,13 @@ def trace(
|
|
|
59
61
|
metrics: Optional[List[BaseMetric]] = None,
|
|
60
62
|
metric_collection: Optional[str] = None,
|
|
61
63
|
):
|
|
64
|
+
if is_async_context():
|
|
65
|
+
trace_manager._print_trace_status(
|
|
66
|
+
message="Warning: Detected use of the synchronous 'trace' context manager within an async method",
|
|
67
|
+
trace_worker_status=TraceWorkerStatus.WARNING,
|
|
68
|
+
description="Wrapping an async method with the synchronous 'trace' context manager may lead to unexpected behavior.",
|
|
69
|
+
)
|
|
70
|
+
|
|
62
71
|
current_trace = current_trace_context.get()
|
|
63
72
|
|
|
64
73
|
if not current_trace:
|
deepeval/tracing/tracing.py
CHANGED
|
@@ -19,6 +19,7 @@ import random
|
|
|
19
19
|
import atexit
|
|
20
20
|
import queue
|
|
21
21
|
import uuid
|
|
22
|
+
from anthropic import Anthropic
|
|
22
23
|
from openai import OpenAI
|
|
23
24
|
from rich.console import Console
|
|
24
25
|
from rich.progress import Progress
|
|
@@ -38,7 +39,10 @@ from deepeval.tracing.api import (
|
|
|
38
39
|
TraceSpanApiStatus,
|
|
39
40
|
)
|
|
40
41
|
from deepeval.telemetry import capture_send_trace
|
|
41
|
-
from deepeval.tracing.patchers import
|
|
42
|
+
from deepeval.tracing.patchers import (
|
|
43
|
+
patch_anthropic_client,
|
|
44
|
+
patch_openai_client,
|
|
45
|
+
)
|
|
42
46
|
from deepeval.tracing.types import (
|
|
43
47
|
AgentSpan,
|
|
44
48
|
BaseSpan,
|
|
@@ -111,6 +115,7 @@ class TraceManager:
|
|
|
111
115
|
|
|
112
116
|
self.sampling_rate = settings.CONFIDENT_TRACE_SAMPLE_RATE
|
|
113
117
|
validate_sampling_rate(self.sampling_rate)
|
|
118
|
+
self.anthropic_client = None
|
|
114
119
|
self.openai_client = None
|
|
115
120
|
self.tracing_enabled = True
|
|
116
121
|
|
|
@@ -139,7 +144,7 @@ class TraceManager:
|
|
|
139
144
|
|
|
140
145
|
def mask(self, data: Any):
|
|
141
146
|
if self.custom_mask_fn is not None:
|
|
142
|
-
self.custom_mask_fn(data)
|
|
147
|
+
return self.custom_mask_fn(data)
|
|
143
148
|
else:
|
|
144
149
|
return data
|
|
145
150
|
|
|
@@ -149,6 +154,7 @@ class TraceManager:
|
|
|
149
154
|
environment: Optional[str] = None,
|
|
150
155
|
sampling_rate: Optional[float] = None,
|
|
151
156
|
confident_api_key: Optional[str] = None,
|
|
157
|
+
anthropic_client: Optional[Anthropic] = None,
|
|
152
158
|
openai_client: Optional[OpenAI] = None,
|
|
153
159
|
tracing_enabled: Optional[bool] = None,
|
|
154
160
|
) -> None:
|
|
@@ -165,6 +171,9 @@ class TraceManager:
|
|
|
165
171
|
if openai_client is not None:
|
|
166
172
|
self.openai_client = openai_client
|
|
167
173
|
patch_openai_client(openai_client)
|
|
174
|
+
if anthropic_client is not None:
|
|
175
|
+
self.anthropic_client = anthropic_client
|
|
176
|
+
patch_anthropic_client(anthropic_client)
|
|
168
177
|
if tracing_enabled is not None:
|
|
169
178
|
self.tracing_enabled = tracing_enabled
|
|
170
179
|
|