deepeval 3.6.1__py3-none-any.whl → 3.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/confident/api.py +1 -0
- deepeval/evaluate/execute.py +11 -2
- deepeval/metrics/g_eval/g_eval.py +3 -2
- deepeval/metrics/hallucination/hallucination.py +1 -1
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/models/llms/amazon_bedrock_model.py +3 -31
- deepeval/models/llms/openai_model.py +0 -1
- deepeval/models/llms/utils.py +22 -0
- deepeval/prompt/api.py +2 -0
- deepeval/prompt/prompt.py +281 -121
- deepeval/test_case/llm_test_case.py +3 -0
- deepeval/tracing/otel/utils.py +71 -55
- deepeval/tracing/tracing.py +7 -1
- deepeval/tracing/utils.py +3 -6
- {deepeval-3.6.1.dist-info → deepeval-3.6.3.dist-info}/METADATA +2 -2
- {deepeval-3.6.1.dist-info → deepeval-3.6.3.dist-info}/RECORD +20 -20
- {deepeval-3.6.1.dist-info → deepeval-3.6.3.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.1.dist-info → deepeval-3.6.3.dist-info}/WHEEL +0 -0
- {deepeval-3.6.1.dist-info → deepeval-3.6.3.dist-info}/entry_points.txt +0 -0
deepeval/_version.py
CHANGED

@@ -1 +1 @@
-__version__: str = "3.6.1"
+__version__: str = "3.6.3"

deepeval/confident/api.py
CHANGED

@@ -90,6 +90,7 @@ class Endpoints(Enum):
     TRACES_ENDPOINT = "/v1/traces"
     ANNOTATIONS_ENDPOINT = "/v1/annotations"
     PROMPTS_VERSION_ID_ENDPOINT = "/v1/prompts/:alias/versions/:versionId"
+    PROMPTS_LABEL_ENDPOINT = "/v1/prompts/:alias/labels/:label"
     PROMPTS_ENDPOINT = "/v1/prompts"
     PROMPTS_VERSIONS_ENDPOINT = "/v1/prompts/:alias/versions"
     SIMULATE_ENDPOINT = "/v1/simulate"

deepeval/evaluate/execute.py
CHANGED

@@ -836,7 +836,13 @@ def execute_agentic_test_cases(
         ):
             if asyncio.iscoroutinefunction(observed_callback):
                 loop = get_or_create_event_loop()
-                loop.run_until_complete(observed_callback(golden.input))
+                coro = observed_callback(golden.input)
+                loop.run_until_complete(
+                    asyncio.wait_for(
+                        coro,
+                        timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+                    )
+                )
             else:
                 observed_callback(golden.input)
             current_trace: Trace = current_trace_context.get()

@@ -1190,7 +1196,10 @@ async def _a_execute_agentic_test_case(
             _pbar_callback_id=pbar_tags_id,
         ):
             if asyncio.iscoroutinefunction(observed_callback):
-                await observed_callback(golden.input)
+                await asyncio.wait_for(
+                    observed_callback(golden.input),
+                    timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+                )
             else:
                 observed_callback(golden.input)
             current_trace: Trace = current_trace_context.get()

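Both hunks wrap the observed callback in `asyncio.wait_for` so a hanging agent callback is cancelled after a per-task timeout instead of blocking the whole run. A minimal sketch of the pattern outside deepeval; the 300-second value is an assumption standing in for `settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS`:

```python
import asyncio

PER_TASK_TIMEOUT_SECONDS = 300  # assumed stand-in for settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS


async def observed_callback(user_input: str) -> str:
    # stand-in for a user's traced agent callback
    await asyncio.sleep(0.1)
    return f"answered: {user_input}"


def run_callback_with_timeout(user_input: str) -> str:
    loop = asyncio.new_event_loop()
    try:
        # raises asyncio.TimeoutError (and cancels the task) if the callback overruns
        return loop.run_until_complete(
            asyncio.wait_for(
                observed_callback(user_input),
                timeout=PER_TASK_TIMEOUT_SECONDS,
            )
        )
    finally:
        loop.close()


print(run_callback_with_timeout("How do refunds work?"))
```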
deepeval/metrics/g_eval/g_eval.py
CHANGED

@@ -97,7 +97,8 @@ class GEval(BaseMetric):
                 test_case, _additional_context=_additional_context
             )
             self.score = (
-                float(g_score)
+                (float(g_score) - self.score_range[0])
+                / self.score_range_span
                 if not self.strict_mode
                 else int(g_score)
             )

@@ -140,7 +141,7 @@ class GEval(BaseMetric):
                 test_case, _additional_context=_additional_context
             )
             self.score = (
-                float(g_score) / self.score_range_span
+                (float(g_score) - self.score_range[0]) / self.score_range_span
                 if not self.strict_mode
                 else int(g_score)
             )

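The updated expression subtracts the lower bound of `score_range` before dividing by the span, so rubrics whose scale does not start at zero still normalize into [0, 1]. A small worked example with an assumed 1-to-5 range (illustrative values, not deepeval defaults):

```python
# Assumed rubric range for illustration
score_range = (1, 5)
score_range_span = score_range[1] - score_range[0]  # 4

g_score = 4.0
old_normalized = g_score / score_range_span                     # 1.0, over-counts because the floor is 1
new_normalized = (g_score - score_range[0]) / score_range_span  # 0.75

print(old_normalized, new_normalized)
```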
deepeval/metrics/hallucination/hallucination.py
CHANGED

@@ -30,7 +30,7 @@ class HallucinationMetric(BaseMetric):
         threshold: float = 0.5,
         model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         include_reason: bool = True,
-        async_mode: bool =
+        async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
         evaluation_template: Type[

deepeval/metrics/tool_correctness/tool_correctness.py
CHANGED

@@ -1,4 +1,4 @@
-from typing import List,
+from typing import List, Dict
 
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.utils import (

@@ -299,7 +299,7 @@ class ToolCorrectnessMetric(BaseMetric):
     def is_successful(self) -> bool:
         try:
             self.success = self.score >= self.threshold
-        except:
+        except (AttributeError, TypeError):
            self.success = False
         return self.success
 

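Replacing the bare `except:` with `except (AttributeError, TypeError)` keeps `is_successful` from masking unrelated errors while still covering the two cases the comparison can realistically raise: `score` never set, or `score` left as `None`. A small standalone sketch of those two paths (the class here is hypothetical, not deepeval's):

```python
class FakeMetric:
    threshold = 0.5

    def is_successful(self) -> bool:
        try:
            # AttributeError if score was never set, TypeError if it is None
            self.success = self.score >= self.threshold
        except (AttributeError, TypeError):
            self.success = False
        return self.success


m = FakeMetric()
print(m.is_successful())  # False: no score attribute yet

m.score = None
print(m.is_successful())  # False: None >= float raises TypeError

m.score = 0.9
print(m.is_successful())  # True
```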
deepeval/models/llms/amazon_bedrock_model.py
CHANGED

@@ -9,7 +9,7 @@ from deepeval.models.retry_policy import (
     sdk_retries_for,
 )
 from deepeval.models import DeepEvalBaseLLM
-from deepeval.models.llms.utils import trim_and_load_json
+from deepeval.models.llms.utils import trim_and_load_json, safe_asyncio_run
 from deepeval.constants import ProviderSlug as PS
 
 # check aiobotocore availability

@@ -40,7 +40,6 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
         region_name: str,
         aws_access_key_id: Optional[str] = None,
         aws_secret_access_key: Optional[str] = None,
-        temperature: float = 0,
         input_token_cost: float = 0,
         output_token_cost: float = 0,
         generation_kwargs: Optional[Dict] = None,

@@ -53,13 +52,9 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
         self.region_name = region_name
         self.aws_access_key_id = aws_access_key_id
         self.aws_secret_access_key = aws_secret_access_key
-        self.temperature = temperature
         self.input_token_cost = input_token_cost
         self.output_token_cost = output_token_cost
 
-        if self.temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
-
         # prepare aiobotocore session, config, and async exit stack
         self._session = get_session()
         self._exit_stack = AsyncExitStack()

@@ -75,7 +70,7 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, Dict], float]:
-        return
+        return safe_asyncio_run(self.a_generate(prompt, schema))
 
     @retry_bedrock
     async def a_generate(

@@ -142,34 +137,11 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
     ###############################################
 
     def get_converse_request_body(self, prompt: str) -> dict:
-        # Inline parameter translation with defaults
-        param_mapping = {
-            "max_tokens": "maxTokens",
-            "top_p": "topP",
-            "top_k": "topK",
-            "stop_sequences": "stopSequences",
-        }
-
-        # Start with defaults for required parameters
-        translated_kwargs = {
-            "maxTokens": self.generation_kwargs.get("max_tokens", 1000),
-            "topP": self.generation_kwargs.get("top_p", 0),
-        }
-
-        # Add any other parameters from generation_kwargs
-        for key, value in self.generation_kwargs.items():
-            if key not in [
-                "max_tokens",
-                "top_p",
-            ]:  # Skip already handled defaults
-                aws_key = param_mapping.get(key, key)
-                translated_kwargs[aws_key] = value
 
         return {
             "messages": [{"role": "user", "content": [{"text": prompt}]}],
             "inferenceConfig": {
-
-                **translated_kwargs,
+                **self.generation_kwargs,
             },
         }
 

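With the parameter-translation block deleted, `generation_kwargs` is now spread into `inferenceConfig` verbatim, so callers are expected to supply Bedrock's camelCase keys themselves, and there is no longer an implicit `maxTokens`/`topP` default or a `temperature` argument. A rough sketch of what the simplified request body amounts to; the kwargs values are illustrative:

```python
# Illustrative values only; pass whatever the Bedrock Converse API accepts, already camelCased
generation_kwargs = {"maxTokens": 1024, "topP": 0.9, "stopSequences": ["###"]}


def get_converse_request_body(prompt: str) -> dict:
    return {
        "messages": [{"role": "user", "content": [{"text": prompt}]}],
        "inferenceConfig": {
            **generation_kwargs,  # forwarded as-is, no snake_case -> camelCase mapping
        },
    }


print(get_converse_request_body("Summarize this support ticket."))
```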
deepeval/models/llms/utils.py
CHANGED

@@ -1,6 +1,7 @@
 from typing import Dict
 import re
 import json
+import asyncio
 
 
 def trim_and_load_json(

@@ -20,3 +21,24 @@ def trim_and_load_json(
         raise ValueError(error_str)
     except Exception as e:
         raise Exception(f"An unexpected error occurred: {str(e)}")
+
+
+def safe_asyncio_run(coro):
+    """
+    Run an async coroutine safely.
+    Falls back to run_until_complete if already in a running event loop.
+    """
+    try:
+        return asyncio.run(coro)
+    except RuntimeError:
+        try:
+            loop = asyncio.get_event_loop()
+            if loop.is_running():
+                future = asyncio.ensure_future(coro)
+                return loop.run_until_complete(future)
+            else:
+                return loop.run_until_complete(coro)
+        except Exception as inner_e:
+            raise
+    except Exception as e:
+        raise

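A minimal usage sketch of the new helper when called from plain synchronous code; the coroutine below is a stand-in, not part of deepeval. This is the same path `AmazonBedrockModel.generate` now takes when it delegates to `a_generate`.

```python
import asyncio

from deepeval.models.llms.utils import safe_asyncio_run  # assumes deepeval >= 3.6.3


async def fetch_completion(prompt: str) -> str:
    # placeholder for an async model call such as AmazonBedrockModel.a_generate
    await asyncio.sleep(0.05)
    return f"completion for: {prompt}"


# No event loop is running here, so safe_asyncio_run takes the plain asyncio.run path
print(safe_asyncio_run(fetch_completion("hello")))
```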
deepeval/prompt/api.py
CHANGED

@@ -45,6 +45,8 @@ class PromptVersionsHttpResponse(BaseModel):
 
 class PromptHttpResponse(BaseModel):
     id: str
+    version: str
+    label: Optional[str] = None
     text: Optional[str] = None
     messages: Optional[List[PromptMessage]] = None
     interpolation_type: PromptInterpolationType = Field(

deepeval/prompt/prompt.py
CHANGED

@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Optional, List, Dict
+from typing import Literal, Optional, List, Dict
 from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
 from rich.console import Console
 import time

@@ -7,6 +7,7 @@ import json
 import os
 from pydantic import BaseModel
 import asyncio
+import portalocker
 
 from deepeval.prompt.api import (
     PromptHttpResponse,

@@ -25,6 +26,8 @@ from deepeval.utils import (
 )
 
 CACHE_FILE_NAME = f"{HIDDEN_DIR}/.deepeval-prompt-cache.json"
+VERSION_CACHE_KEY = "version"
+LABEL_CACHE_KEY = "label"
 
 
 class CustomEncoder(json.JSONEncoder):

@@ -39,6 +42,7 @@ class CustomEncoder(json.JSONEncoder):
 class CachedPrompt(BaseModel):
     alias: str
     version: str
+    label: Optional[str] = None
     template: Optional[str]
     messages_template: Optional[List[PromptMessage]]
     prompt_version_id: str

@@ -50,6 +54,7 @@ class CachedPrompt(BaseModel):
 
 
 class Prompt:
+    label: Optional[str] = None
     _prompt_version_id: Optional[str] = None
     _type: Optional[PromptType] = None
     _interpolation_type: Optional[PromptInterpolationType] = None

@@ -73,8 +78,8 @@ class Prompt:
         self._text_template = template
         self._messages_template = messages_template
         self._version = None
-        self._polling_tasks: Dict[str, asyncio.Task] = {}
-        self._refresh_map: Dict[str, int] = {}
+        self._polling_tasks: Dict[str, Dict[str, asyncio.Task]] = {}
+        self._refresh_map: Dict[str, Dict[str, int]] = {}
         if template:
             self._type = PromptType.TEXT
         elif messages_template:

@@ -138,87 +143,173 @@ class Prompt:
         return versions.text_versions or versions.messages_versions or []
 
     def _read_from_cache(
-        self,
+        self,
+        alias: str,
+        version: Optional[str] = None,
+        label: Optional[str] = None,
     ) -> Optional[CachedPrompt]:
         if not os.path.exists(CACHE_FILE_NAME):
-
+            return None
 
         try:
-
+            # Use shared lock for reading to allow concurrent reads
+            with portalocker.Lock(
+                CACHE_FILE_NAME,
+                mode="r",
+                flags=portalocker.LOCK_SH | portalocker.LOCK_NB,
+            ) as f:
                 cache_data = json.load(f)
 
             if alias in cache_data:
                 if version:
-                    if
-
-
-
-
+                    if (
+                        VERSION_CACHE_KEY in cache_data[alias]
+                        and version in cache_data[alias][VERSION_CACHE_KEY]
+                    ):
+                        return CachedPrompt(
+                            **cache_data[alias][VERSION_CACHE_KEY][version]
                         )
-
-
-
-
-
-
-
-
-
-
+                elif label:
+                    if (
+                        LABEL_CACHE_KEY in cache_data[alias]
+                        and label in cache_data[alias][LABEL_CACHE_KEY]
+                    ):
+                        return CachedPrompt(
+                            **cache_data[alias][LABEL_CACHE_KEY][label]
+                        )
+            return None
+        except (portalocker.exceptions.LockException, Exception):
+            # If cache is locked, corrupted or unreadable, return None and let it fetch from API
+            return None
 
     def _write_to_cache(
         self,
-
+        cache_key: Literal[VERSION_CACHE_KEY, LABEL_CACHE_KEY],
+        version: str,
+        label: Optional[str] = None,
         text_template: Optional[str] = None,
         messages_template: Optional[List[PromptMessage]] = None,
         prompt_version_id: Optional[str] = None,
         type: Optional[PromptType] = None,
         interpolation_type: Optional[PromptInterpolationType] = None,
     ):
-        if not self.alias
+        if not self.alias:
             return
 
-        cache_data = {}
-        if os.path.exists(CACHE_FILE_NAME):
-            try:
-                with open(CACHE_FILE_NAME, "r") as f:
-                    cache_data = json.load(f)
-            except Exception:
-                cache_data = {}
-
-        # Ensure the cache structure is initialized properly
-        if self.alias not in cache_data:
-            cache_data[self.alias] = {}
-
-        # Cache the prompt
-        cache_data[self.alias][version] = {
-            "alias": self.alias,
-            "version": version,
-            "template": text_template,
-            "messages_template": messages_template,
-            "prompt_version_id": prompt_version_id,
-            "type": type,
-            "interpolation_type": interpolation_type,
-        }
-
         # Ensure directory exists
         os.makedirs(HIDDEN_DIR, exist_ok=True)
 
-
-
-
+        try:
+            # Use r+ mode if file exists, w mode if it doesn't
+            mode = "r+" if os.path.exists(CACHE_FILE_NAME) else "w"
+
+            with portalocker.Lock(
+                CACHE_FILE_NAME,
+                mode=mode,
+                flags=portalocker.LOCK_EX,
+            ) as f:
+                # Read existing cache data if file exists and has content
+                cache_data = {}
+                if mode == "r+":
+                    try:
+                        f.seek(0)
+                        content = f.read()
+                        if content:
+                            cache_data = json.loads(content)
+                    except (json.JSONDecodeError, Exception):
+                        cache_data = {}
+
+                # Ensure the cache structure is initialized properly
+                if self.alias not in cache_data:
+                    cache_data[self.alias] = {}
+
+                if cache_key not in cache_data[self.alias]:
+                    cache_data[self.alias][cache_key] = {}
+
+                # Cache the prompt
+                cached_entry = {
+                    "alias": self.alias,
+                    "version": version,
+                    "label": label,
+                    "template": text_template,
+                    "messages_template": messages_template,
+                    "prompt_version_id": prompt_version_id,
+                    "type": type,
+                    "interpolation_type": interpolation_type,
+                }
+
+                if cache_key == VERSION_CACHE_KEY:
+                    cache_data[self.alias][cache_key][version] = cached_entry
+                else:
+                    cache_data[self.alias][cache_key][label] = cached_entry
+
+                # Write back to cache file
+                f.seek(0)
+                f.truncate()
+                json.dump(cache_data, f, cls=CustomEncoder)
+        except portalocker.exceptions.LockException:
+            # If we can't acquire the lock, silently skip caching
+            pass
+        except Exception:
+            # If any other error occurs during caching, silently skip
+            pass
+
+    def _load_from_cache_with_progress(
+        self,
+        progress: Progress,
+        task_id: int,
+        start_time: float,
+        version: Optional[str] = None,
+        label: Optional[str] = None,
+    ):
+        """
+        Load prompt from cache and update progress bar.
+        Raises if unable to load from cache.
+        """
+        cached_prompt = self._read_from_cache(
+            self.alias, version=version, label=label
+        )
+        if not cached_prompt:
+            raise ValueError("Unable to fetch prompt and load from cache")
+
+        self.version = cached_prompt.version
+        self.label = cached_prompt.label
+        self._text_template = cached_prompt.template
+        self._messages_template = cached_prompt.messages_template
+        self._prompt_version_id = cached_prompt.prompt_version_id
+        self._type = PromptType(cached_prompt.type)
+        self._interpolation_type = PromptInterpolationType(
+            cached_prompt.interpolation_type
+        )
+
+        end_time = time.perf_counter()
+        time_taken = format(end_time - start_time, ".2f")
+        progress.update(
+            task_id,
+            description=f"{progress.tasks[task_id].description}[rgb(25,227,160)]Loaded from cache! ({time_taken}s)",
+        )
 
     def pull(
         self,
         version: Optional[str] = None,
+        label: Optional[str] = None,
         fallback_to_cache: bool = True,
         write_to_cache: bool = True,
         default_to_cache: bool = True,
         refresh: Optional[int] = 60,
     ):
+        should_write_on_first_fetch = False
         if refresh:
             default_to_cache = True
-
+            # Check if we need to bootstrap the cache
+            cached_prompt = self._read_from_cache(
+                self.alias, version=version, label=label
+            )
+            if cached_prompt is None:
+                # No cache exists, so we should write after fetching to bootstrap
+                should_write_on_first_fetch = True
+                write_to_cache = False  # Polling will handle subsequent writes
 
         if self.alias is None:
             raise TypeError(
                 "Unable to pull prompt from Confident AI when no alias is provided."

@@ -227,15 +318,20 @@ class Prompt:
         # Manage background prompt polling
         loop = get_or_create_general_event_loop()
         if loop.is_running():
-            loop.create_task(self.create_polling_task(version, refresh))
+            loop.create_task(self.create_polling_task(version, label, refresh))
         else:
-            loop.run_until_complete(
+            loop.run_until_complete(
+                self.create_polling_task(version, label, refresh)
+            )
 
         if default_to_cache:
             try:
-                cached_prompt = self._read_from_cache(
+                cached_prompt = self._read_from_cache(
+                    self.alias, version=version, label=label
+                )
                 if cached_prompt:
                     self.version = cached_prompt.version
+                    self.label = cached_prompt.label
                     self._text_template = cached_prompt.template
                     self._messages_template = cached_prompt.messages_template
                     self._prompt_version_id = cached_prompt.prompt_version_id

@@ -254,58 +350,60 @@ class Prompt:
             TextColumn("[progress.description]{task.description}"),
             transient=False,
         ) as progress:
+            HINT_TEXT = (
+                f"version='{version or 'latest'}'"
+                if not label
+                else f"label='{label}'"
+            )
             task_id = progress.add_task(
-                f"Pulling [rgb(106,0,255)]'{self.alias}' (
+                f"Pulling [rgb(106,0,255)]'{self.alias}' ({HINT_TEXT})[/rgb(106,0,255)] from Confident AI...",
                 total=100,
             )
+
             start_time = time.perf_counter()
             try:
-
-
-
-
-
-
-
+                if label:
+                    data, _ = api.send_request(
+                        method=HttpMethods.GET,
+                        endpoint=Endpoints.PROMPTS_LABEL_ENDPOINT,
+                        url_params={
+                            "alias": self.alias,
+                            "label": label,
+                        },
+                    )
+                else:
+                    data, _ = api.send_request(
+                        method=HttpMethods.GET,
+                        endpoint=Endpoints.PROMPTS_VERSION_ID_ENDPOINT,
+                        url_params={
+                            "alias": self.alias,
+                            "versionId": version or "latest",
+                        },
+                    )
+
                 response = PromptHttpResponse(
                     id=data["id"],
+                    version=data.get("version", None),
+                    label=data.get("label", None),
                     text=data.get("text", None),
                     messages=data.get("messages", None),
                     type=data["type"],
                     interpolation_type=data["interpolationType"],
                 )
-            except:
-
-
-
-
-
-
-
-
-
-
-
-
-
-                )
-                self._type = PromptType(cached_prompt.type)
-                self._interpolation_type = PromptInterpolationType(
-                    cached_prompt.interpolation_type
-                )
-
-                end_time = time.perf_counter()
-                time_taken = format(end_time - start_time, ".2f")
-                progress.update(
-                    task_id,
-                    description=f"{progress.tasks[task_id].description}[rgb(25,227,160)]Loaded from cache! ({time_taken}s)",
-                )
-                return
-            except:
-                raise
-
-            self.version = version or "latest"
+            except Exception:
+                if fallback_to_cache:
+                    self._load_from_cache_with_progress(
+                        progress,
+                        task_id,
+                        start_time,
+                        version=version,
+                        label=label,
+                    )
+                    return
+                raise
+
+            self.version = response.version
+            self.label = response.label
             self._text_template = response.text
             self._messages_template = response.messages
             self._prompt_version_id = response.id

@@ -318,9 +416,12 @@ class Prompt:
                 task_id,
                 description=f"{progress.tasks[task_id].description}[rgb(25,227,160)]Done! ({time_taken}s)",
             )
-            if
+            # Write to cache if explicitly requested OR if we need to bootstrap cache for refresh mode
+            if write_to_cache or should_write_on_first_fetch:
                 self._write_to_cache(
-
+                    cache_key=LABEL_CACHE_KEY if label else VERSION_CACHE_KEY,
+                    version=response.version,
+                    label=response.label,
                     text_template=response.text,
                     messages_template=response.messages,
                     prompt_version_id=response.id,

@@ -380,55 +481,114 @@ class Prompt:
     async def create_polling_task(
         self,
         version: Optional[str],
+        label: Optional[str],
         refresh: Optional[int] = 60,
+        default_to_cache: bool = True,
     ):
-        if version is None:
+        if version is None and label is None:
            return
 
         # If polling task doesn't exist, start it
-
+        CACHE_KEY = LABEL_CACHE_KEY if label else VERSION_CACHE_KEY
+        cache_value = label if label else version
+
+        # Initialize nested dicts if they don't exist
+        if CACHE_KEY not in self._polling_tasks:
+            self._polling_tasks[CACHE_KEY] = {}
+        if CACHE_KEY not in self._refresh_map:
+            self._refresh_map[CACHE_KEY] = {}
+
+        polling_task: Optional[asyncio.Task] = self._polling_tasks[
+            CACHE_KEY
+        ].get(cache_value)
+
         if refresh:
-            self._refresh_map[
+            self._refresh_map[CACHE_KEY][cache_value] = refresh
             if not polling_task:
-                self._polling_tasks[
-
+                self._polling_tasks[CACHE_KEY][cache_value] = (
+                    asyncio.create_task(
+                        self.poll(version, label, default_to_cache)
+                    )
                 )
 
         # If invalid `refresh`, stop the task
         else:
             if polling_task:
                 polling_task.cancel()
-                self._polling_tasks
-
+                if cache_value in self._polling_tasks[CACHE_KEY]:
+                    self._polling_tasks[CACHE_KEY].pop(cache_value)
+                if cache_value in self._refresh_map[CACHE_KEY]:
+                    self._refresh_map[CACHE_KEY].pop(cache_value)
 
-    async def poll(
-
+    async def poll(
+        self,
+        version: Optional[str] = None,
+        label: Optional[str] = None,
+        default_to_cache: bool = True,
+    ):
         while True:
-
-
-
-            endpoint=Endpoints.PROMPTS_VERSION_ID_ENDPOINT,
-            url_params={
-                "alias": self.alias,
-                "versionId": version or "latest",
-            },
+            if default_to_cache:
+                cached_prompt = self._read_from_cache(
+                    self.alias, version=version, label=label
                 )
+                if cached_prompt:
+                    self.version = cached_prompt.version
+                    self.label = cached_prompt.label
+                    self._text_template = cached_prompt.template
+                    self._messages_template = cached_prompt.messages_template
+                    self._prompt_version_id = cached_prompt.prompt_version_id
+                    self._type = PromptType(cached_prompt.type)
+                    self._interpolation_type = PromptInterpolationType(
+                        cached_prompt.interpolation_type
+                    )
+                    return
+
+            api = Api()
+            try:
+                if label:
+                    data, _ = api.send_request(
+                        method=HttpMethods.GET,
+                        endpoint=Endpoints.PROMPTS_LABEL_ENDPOINT,
+                        url_params={
+                            "alias": self.alias,
+                            "label": label,
+                        },
+                    )
+                else:
+                    data, _ = api.send_request(
+                        method=HttpMethods.GET,
+                        endpoint=Endpoints.PROMPTS_VERSION_ID_ENDPOINT,
+                        url_params={
+                            "alias": self.alias,
+                            "versionId": version or "latest",
+                        },
+                    )
+
                 response = PromptHttpResponse(
                     id=data["id"],
+                    version=data.get("version", None),
+                    label=data.get("label", None),
                    text=data.get("text", None),
                     messages=data.get("messages", None),
                     type=data["type"],
                     interpolation_type=data["interpolationType"],
                 )
-
-
-
-
-
-
-
-
+                if default_to_cache:
+                    self._write_to_cache(
+                        cache_key=(
+                            LABEL_CACHE_KEY if label else VERSION_CACHE_KEY
+                        ),
+                        version=response.version,
+                        label=response.label,
+                        text_template=response.text,
+                        messages_template=response.messages,
+                        prompt_version_id=response.id,
+                        type=response.type,
+                        interpolation_type=response.interpolation_type,
+                    )
             except Exception as e:
                 pass
 
-
+            CACHE_KEY = LABEL_CACHE_KEY if label else VERSION_CACHE_KEY
+            cache_value = label if label else version
+            await asyncio.sleep(self._refresh_map[CACHE_KEY][cache_value])

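Taken together, these changes let a prompt be pulled by label as well as by version, with the label-keyed entry cached under its own key (and optionally kept fresh by the background polling task) alongside version-keyed entries. A hedged usage sketch; the alias and label strings are made up and the constructor call assumes the usual `Prompt(alias=...)` form:

```python
from deepeval.prompt import Prompt  # assumes deepeval >= 3.6.3

prompt = Prompt(alias="support-bot")  # hypothetical alias

# Pull by label instead of version; hits /v1/prompts/:alias/labels/:label
prompt.pull(label="production")  # hypothetical label
print(prompt.version, prompt.label)

# Pulling by version still works and is cached under the "version" key
prompt.pull(version="latest")
```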
deepeval/test_case/llm_test_case.py
CHANGED

@@ -1,4 +1,5 @@
 from pydantic import (
+    ConfigDict,
     Field,
     BaseModel,
     model_validator,

@@ -151,6 +152,8 @@ class ToolCall(BaseModel):
 
 
 class LLMTestCase(BaseModel):
+    model_config = ConfigDict(extra="ignore")
+
     input: str
     actual_output: Optional[str] = Field(
         default=None,

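Setting `extra="ignore"` on the model config makes `LLMTestCase` drop unknown keyword arguments instead of rejecting them (as `extra="forbid"` would). A small illustration; `some_future_field` is an invented key:

```python
from deepeval.test_case import LLMTestCase  # assumes deepeval >= 3.6.3

tc = LLMTestCase(
    input="What is the refund window?",
    actual_output="30 days.",
    some_future_field="dropped silently",  # invented extra key: ignored, no ValidationError
)
print(tc.input, tc.actual_output)
```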
deepeval/tracing/otel/utils.py
CHANGED

@@ -1,11 +1,16 @@
+import json
+
 from typing import List, Optional, Tuple, Any
-from deepeval.tracing.types import Trace, LLMTestCase, ToolCall
-from deepeval.tracing import trace_manager, BaseSpan
 from opentelemetry.sdk.trace.export import ReadableSpan
-import json
 
+from deepeval.evaluate.utils import create_api_test_case
+from deepeval.test_run.api import LLMApiTestCase
+from deepeval.test_run.test_run import global_test_run_manager
+from deepeval.tracing.types import Trace, LLMTestCase, ToolCall
+from deepeval.tracing import trace_manager, BaseSpan
 from deepeval.tracing.utils import make_json_serializable
 
+
 GEN_AI_OPERATION_NAMES = ["chat", "generate_content", "task_completion"]
 
 

@@ -107,12 +112,12 @@ check_llm_input_from_gen_ai_attributes(
         input = json.loads(span.attributes.get("gen_ai.input.messages"))
         input = _flatten_input(input)
 
-    except Exception
+    except Exception:
         pass
     try:
         output = json.loads(span.attributes.get("gen_ai.output.messages"))
         output = _flatten_input(output)
-    except Exception
+    except Exception:
         pass
 
     if input is None and output is None:

@@ -126,7 +131,7 @@ check_llm_input_from_gen_ai_attributes(
             and last_event.get("event.name") == "gen_ai.choice"
         ):
             output = last_event
-    except Exception
+    except Exception:
         pass
 
     return input, output

@@ -181,7 +186,7 @@ def _flatten_input(input: list) -> list:
                 }
             )
             return result
-    except Exception
+    except Exception:
         return input
 
     return input

@@ -192,7 +197,7 @@ def check_tool_name_from_gen_ai_attributes(span: ReadableSpan) -> Optional[str]:
         gen_ai_tool_name = span.attributes.get("gen_ai.tool.name")
         if gen_ai_tool_name:
             return gen_ai_tool_name
-    except Exception
+    except Exception:
         pass
 
     return None

@@ -205,7 +210,7 @@ def check_tool_input_parameters_from_gen_ai_attributes(
         tool_arguments = span.attributes.get("tool_arguments")
         if tool_arguments:
             return json.loads(tool_arguments)
-    except Exception
+    except Exception:
         pass
 
     return None

@@ -224,7 +229,7 @@ def check_span_type_from_gen_ai_attributes(span: ReadableSpan):
 
         elif gen_ai_tool_name:
             return "tool"
-    except Exception
+    except Exception:
         pass
 
     return "base"

@@ -235,7 +240,7 @@ def check_model_from_gen_ai_attributes(span: ReadableSpan):
         gen_ai_request_model_name = span.attributes.get("gen_ai.request.model")
         if gen_ai_request_model_name:
             return gen_ai_request_model_name
-    except Exception
+    except Exception:
         pass
 
     return None

@@ -286,7 +291,7 @@ def prepare_trace_llm_test_case(span: ReadableSpan) -> Optional[LLMTestCase]:
                 tools_called.append(
                     ToolCall.model_validate_json(tool_call_json_str)
                 )
-    except Exception
+    except Exception:
         pass
 
     _expected_tools = span.attributes.get(

@@ -299,7 +304,7 @@ def prepare_trace_llm_test_case(span: ReadableSpan) -> Optional[LLMTestCase]:
                 expected_tools.append(
                     ToolCall.model_validate_json(tool_call_json_str)
                 )
-    except Exception
+    except Exception:
         pass
 
     test_case.tools_called = tools_called

@@ -328,12 +333,6 @@ def parse_list_of_strings(context: List[str]) -> List[str]:
     return parsed_context
 
 
-from deepeval.evaluate.utils import create_api_test_case
-from deepeval.test_run.api import LLMApiTestCase
-from deepeval.test_run.test_run import global_test_run_manager
-from typing import Optional
-
-
 def post_test_run(traces: List[Trace], test_run_id: Optional[str]):
     # Accept single trace or list of traces
     if isinstance(traces, Trace):

@@ -384,53 +383,70 @@ def post_test_run(traces: List[Trace], test_run_id: Optional[str]):
     # return test_run_manager.post_test_run(test_run) TODO: add after test run with metric collection is implemented
 
 
+def _normalize_pydantic_ai_messages(span: ReadableSpan) -> Optional[list]:
+    try:
+        raw = span.attributes.get("pydantic_ai.all_messages")
+        if not raw:
+            return None
+
+        messages = raw
+        if isinstance(messages, str):
+            messages = json.loads(messages)
+        elif isinstance(messages, tuple):
+            messages = list(messages)
+
+        if isinstance(messages, list):
+            normalized = []
+            for m in messages:
+                if isinstance(m, str):
+                    try:
+                        m = json.loads(m)
+                    except Exception:
+                        pass
+                normalized.append(m)
+            return normalized
+    except Exception:
+        pass
+
+    return None
+
+
 def check_pydantic_ai_agent_input_output(
     span: ReadableSpan,
 ) -> Tuple[Optional[Any], Optional[Any]]:
     input_val: Optional[Any] = None
     output_val: Optional[Any] = None
 
+    # Get normalized messages once
+    normalized = _normalize_pydantic_ai_messages(span)
+
     # Input (pydantic_ai.all_messages) - slice up to and including the first 'user' message
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    first_user_idx = None
-    for i, m in enumerate(normalized):
-        role = None
-        if isinstance(m, dict):
-            role = m.get("role") or m.get("author")
-        if role == "user":
-            first_user_idx = i
-            break
-
-    input_val = (
-        normalized
-        if first_user_idx is None
-        else normalized[: first_user_idx + 1]
-    )
-    except Exception:
-        pass
+    if normalized:
+        try:
+            first_user_idx = None
+            for i, m in enumerate(normalized):
+                role = None
+                if isinstance(m, dict):
+                    role = m.get("role") or m.get("author")
+                if role == "user":
+                    first_user_idx = i
+                    break
+
+            input_val = (
+                normalized
+                if first_user_idx is None
+                else normalized[: first_user_idx + 1]
+            )
+        except Exception:
+            pass
 
     # Output (agent final_result)
     try:
         if span.attributes.get("confident.span.type") == "agent":
             output_val = span.attributes.get("final_result")
+            if not output_val and normalized:
+                # Extract the last message if no final_result is available
+                output_val = normalized[-1]
     except Exception:
         pass
 

@@ -442,7 +458,7 @@ check_pydantic_ai_agent_input_output(
 def check_tool_output(span: ReadableSpan):
     try:
         return span.attributes.get("tool_response")
-    except Exception
+    except Exception:
         pass
     return None
 

deepeval/tracing/tracing.py
CHANGED

@@ -208,7 +208,13 @@ class TraceManager:
         else:
             # print(f"Ending trace: {trace.root_spans}")
             self.environment = Environment.TESTING
-
+            if (
+                trace.root_spans
+                and len(trace.root_spans) > 0
+                and trace.root_spans[0].children
+                and len(trace.root_spans[0].children) > 0
+            ):
+                trace.root_spans = [trace.root_spans[0].children[0]]
             for root_span in trace.root_spans:
                 root_span.parent_uuid = None
 

deepeval/tracing/utils.py
CHANGED

@@ -1,15 +1,12 @@
 import os
-import time
 import inspect
 import json
 import sys
-import difflib
 from datetime import datetime, timezone
 from enum import Enum
 from time import perf_counter
-import time
 from collections import deque
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional
 
 from deepeval.constants import CONFIDENT_TRACING_ENABLED
 

@@ -189,8 +186,8 @@ def perf_counter_to_datetime(perf_counter_value: float) -> datetime:
 def replace_self_with_class_name(obj):
     try:
         return f"<{obj.__class__.__name__}>"
-    except:
-        return
+    except Exception:
+        return "<self>"
 
 
 def get_deepeval_trace_mode() -> Optional[str]:

{deepeval-3.6.1.dist-info → deepeval-3.6.3.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepeval
-Version: 3.6.1
+Version: 3.6.3
 Summary: The LLM Evaluation Framework
 Home-page: https://github.com/confident-ai/deepeval
 License: Apache-2.0

@@ -359,7 +359,7 @@ for golden in dataset.goldens:
 
 @pytest.mark.parametrize(
     "test_case",
-    dataset,
+    dataset.test_cases,
 )
 def test_customer_chatbot(test_case: LLMTestCase):
     answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5)

{deepeval-3.6.1.dist-info → deepeval-3.6.3.dist-info}/RECORD
CHANGED

@@ -1,5 +1,5 @@
 deepeval/__init__.py,sha256=6fsb813LD_jNhqR-xZnSdE5E-KsBbC3tc4oIg5ZMgTw,2115
-deepeval/_version.py,sha256=
+deepeval/_version.py,sha256=1BsEnmEpD1mtVjCYoXBeguVgrKPAi3TRpS_a7ndu4XU,27
 deepeval/annotation/__init__.py,sha256=ZFhUVNNuH_YgQSZJ-m5E9iUb9TkAkEV33a6ouMDZ8EI,111
 deepeval/annotation/annotation.py,sha256=3j3-syeJepAcEj3u3e4T_BeRDzNr7yXGDIoNQGMKpwQ,2298
 deepeval/annotation/api.py,sha256=EYN33ACVzVxsFleRYm60KB4Exvff3rPJKt1VBuuX970,2147

@@ -138,7 +138,7 @@ deepeval/cli/test.py,sha256=kSIFMRTAfVzBJ4OitwvT829-ylV7UzPMP57P2DePS-Q,5482
 deepeval/cli/types.py,sha256=_7KdthstHNc-JKCWrfpDQCf_j8h9PMxh0qJCHmVXJr0,310
 deepeval/cli/utils.py,sha256=F4-yuONzk4ojDoSLjI9RYERB7HOD412iZ2lNlSCq4wk,5601
 deepeval/confident/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-deepeval/confident/api.py,sha256=
+deepeval/confident/api.py,sha256=2ZhrQOtfxcnQSyY6OxrjY17y1yn-NB7pfIiJa20B1Pk,8519
 deepeval/confident/types.py,sha256=-slFhDof_1maMgpLxqDRZv6kz6ZVY2hP_0uj_aveJKU,533
 deepeval/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deepeval/config/settings.py,sha256=gRRi6nXEUKse13xAShU9MA18zo14vpIgl_R0xJ_0vnM,21314

@@ -159,7 +159,7 @@ deepeval/evaluate/api.py,sha256=rkblH0ZFAAdyuF0Ymh7JE1pIJPR9yFuPrn9SQaCEQp4,435
 deepeval/evaluate/compare.py,sha256=tdSJY4E7YJ_zO3dzvpwngZHLiUI2YQcTWJOLI83htsQ,9855
 deepeval/evaluate/configs.py,sha256=QfWjaWNxLsgEe8-5j4PIs5WcSyEckiWt0qdpXSpl57M,928
 deepeval/evaluate/evaluate.py,sha256=NPAJ2iJqJI_RurXKUIC0tft_ozYMIKwZf5iPfmnNhQc,10412
-deepeval/evaluate/execute.py,sha256=
+deepeval/evaluate/execute.py,sha256=XS0XtDGKC1ZOo09lthillfi5aDI5TWFbJ-Y7yICNvGo,89056
 deepeval/evaluate/types.py,sha256=IGZ3Xsj0UecPI3JNeTpJaK1gDvlepokfCmHwtItIW9M,831
 deepeval/evaluate/utils.py,sha256=kkliSGzuICeUsXDtlMMPfN95dUKlqarNhfciSffd4gI,23143
 deepeval/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

@@ -240,12 +240,12 @@ deepeval/metrics/faithfulness/faithfulness.py,sha256=bYVhHI7Tr7xH0x-7F2LijxRuCCE
 deepeval/metrics/faithfulness/schema.py,sha256=2dU9dwwmqpGJcWvY2webERWIfH_tn02xgLghHkAY_eM,437
 deepeval/metrics/faithfulness/template.py,sha256=RuZ0LFm4BjZ8lhVrKPgU3ecHszwkF0fe5-BxAkaP5AA,5839
 deepeval/metrics/g_eval/__init__.py,sha256=HAhsQFVq9LIpZXPN00Jc_WrMXrh47NIT86VnUpWM4_4,102
-deepeval/metrics/g_eval/g_eval.py,sha256=
+deepeval/metrics/g_eval/g_eval.py,sha256=CaW7VHPW-SyXt18IE1rSatgagY238s3It-j6SLRI4H4,14395
 deepeval/metrics/g_eval/schema.py,sha256=V629txuDrr_2IEKEsgJVYYZb_pkdfcltQV9ZjvxK5co,287
 deepeval/metrics/g_eval/template.py,sha256=mHj4-mr_HQwbCjpHg7lM_6UesoSatL3g8UGGQAOdT0U,4509
 deepeval/metrics/g_eval/utils.py,sha256=uUT86jRXVYvLDzcnZvvfWssDyGoBHb66nWcJSg4i1u4,8784
 deepeval/metrics/hallucination/__init__.py,sha256=rCVlHi2UGzDKmZKi0esFLafmshVBx2WZ0jiIb-KqcYQ,44
-deepeval/metrics/hallucination/hallucination.py,sha256=
+deepeval/metrics/hallucination/hallucination.py,sha256=8JN5pj5YWRtl7rgbbFQF6EVBCGm1NV9vaX3_5tScNs4,9548
 deepeval/metrics/hallucination/schema.py,sha256=V8xbrBLMwJfre-lPuDc7rMEdhHf_1hfgoW1jE_ULvAY,286
 deepeval/metrics/hallucination/template.py,sha256=hiss1soxSBFqzOt0KmHZdZUzoQsmXnslDyb8HsjALPs,2620
 deepeval/metrics/indicator.py,sha256=oewo_n5Qet9Zfzo2QQs-EQ8w92siuyDCAmoTZW45ndc,10244

@@ -348,7 +348,7 @@ deepeval/metrics/task_completion/schema.py,sha256=JfnZkbCh7skWvrESy65GEo6Rvo0FDJ
 deepeval/metrics/task_completion/task_completion.py,sha256=RKFkXCVOhO70I8A16zv5BCaV3QVKldNxawJ0T93U_Zc,8978
 deepeval/metrics/task_completion/template.py,sha256=4xjTBcGrPQxInbf8iwJOZyok9SQex1aCkbxKmfkXoA4,10437
 deepeval/metrics/tool_correctness/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deepeval/metrics/tool_correctness/tool_correctness.py,sha256=
+deepeval/metrics/tool_correctness/tool_correctness.py,sha256=j5wB9mJp7BLbn3bTZd6LlIeub1kXxXGaDVWrzyvBFo4,12111
 deepeval/metrics/toxicity/__init__.py,sha256=1lgt8BKxfBDd7bfSLu_5kMzmsr9b2_ahPK9oq5zLkMk,39
 deepeval/metrics/toxicity/schema.py,sha256=7uUdzXqTvIIz5nfahlllo_fzVRXg7UeMeXn7Hl32pKY,459
 deepeval/metrics/toxicity/template.py,sha256=zl4y4Tg9gXkxKJ8aXVwj0cJ94pvfYuP7MTeV3dvB5yQ,5045

@@ -370,7 +370,7 @@ deepeval/models/embedding_models/ollama_embedding_model.py,sha256=w3etdIdWvYfVIE
 deepeval/models/embedding_models/openai_embedding_model.py,sha256=Z1--e3CnNNmwryqmUMxBCaTURjtgKWHqADuUeCqFlSc,3545
 deepeval/models/hallucination_model.py,sha256=ABi978VKLE_jNHbDzM96kJ08EsZ5ZlvOlJHA_ptSkfQ,1003
 deepeval/models/llms/__init__.py,sha256=qmvv7wnmTDvys2uUTwQRo-_3DlFV3fGLiewPeQYRsAI,670
-deepeval/models/llms/amazon_bedrock_model.py,sha256=
+deepeval/models/llms/amazon_bedrock_model.py,sha256=3yiUUGU_d_YK7Usq8v5iqG3yHa5VnqeDOoCLG_p8rtc,5185
 deepeval/models/llms/anthropic_model.py,sha256=5gYRNkYUD7Zl3U0SibBG2YGCQsD6DdTsaBhqdaJlKIw,6072
 deepeval/models/llms/azure_model.py,sha256=dqINcfoJNqdd9zh5iTPwQ_ToGMOF7iH6YUB-UWRSOlc,10730
 deepeval/models/llms/deepseek_model.py,sha256=EqBJkKa7rXppCmlnIt_D-Z_r9fbsOUsOAVvN2jWA-Hk,6404

@@ -380,8 +380,8 @@ deepeval/models/llms/kimi_model.py,sha256=ldTefdSVitZYJJQ-_ZsP87iiT5iZ4QCVdfi-Yz
 deepeval/models/llms/litellm_model.py,sha256=iu4-_JCpd9LdEa-eCWseD2iLTA-r7OSgYGWQ0IxB4eA,11527
 deepeval/models/llms/local_model.py,sha256=hEyKVA6pkQm9dICUKsMNgjVI3w6gnyMdmBt_EylkWDk,4473
 deepeval/models/llms/ollama_model.py,sha256=xPO4d4jMY-cQAyHAcMuFvWS8JMWwCUbKP9CMi838Nuc,3307
-deepeval/models/llms/openai_model.py,sha256=
-deepeval/models/llms/utils.py,sha256=
+deepeval/models/llms/openai_model.py,sha256=mUvQ8a9FVk4lrdZyS_QRZTK4imufyaCNjZFPeqbc0AM,17167
+deepeval/models/llms/utils.py,sha256=gFM_8eIvdSwN_D4Yqp-j7PkfoiRn_bgu7tlCHol3A6c,1324
 deepeval/models/mlllms/__init__.py,sha256=19nN6kUB5XI0nUWUQX0aD9GBUMM8WWGvsDgKjuT4EF4,144
 deepeval/models/mlllms/gemini_model.py,sha256=7tHIWD4w_fBz3L7jkKWygn1QpBPk9nl2Kw-yb0Jc3PI,10167
 deepeval/models/mlllms/ollama_model.py,sha256=_YtYtw8oIMVVI-CFsDicsdeEJUPhw_9ArPxB_1olsJA,4798

@@ -404,8 +404,8 @@ deepeval/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 deepeval/plugins/plugin.py,sha256=_dwsdx4Dg9DbXxK3f7zJY4QWTJQWc7QE1HmIg2Zjjag,1515
 deepeval/progress_context.py,sha256=ZSKpxrE9sdgt9G3REKnVeXAv7GJXHHVGgLynpG1Pudw,3557
 deepeval/prompt/__init__.py,sha256=M99QTWdxOfiNeySGCSqN873Q80PPxqRvjLq4_Mw-X1w,49
-deepeval/prompt/api.py,sha256=
-deepeval/prompt/prompt.py,sha256=
+deepeval/prompt/api.py,sha256=665mLKiq8irXWV8kM9P_qFJipdCYZUNQFwW8AkA3itM,1777
+deepeval/prompt/prompt.py,sha256=w2BmKtSzXxobjSlBQqUjdAB0Zwe6IYaLjLg7KQvVDXE,21999
 deepeval/prompt/utils.py,sha256=Ermw9P-1-T5wQ5uYuj5yWgdj7pVB_JLw8D37Qvmh9ok,1938
 deepeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deepeval/red_teaming/README.md,sha256=BY5rAdpp3-sMMToEKwq0Nsd9ivkGDzPE16DeDb8GY7U,154

@@ -434,7 +434,7 @@ deepeval/telemetry.py,sha256=JPZw1VBJ5dGiS8k-dzWs5OhMbNlr65QgVretTy33WCg,21704
 deepeval/test_case/__init__.py,sha256=hLkHxGH0-FFhx4MlJwIbzNHL4pgyLGquh8l0qD-z_cQ,731
 deepeval/test_case/arena_test_case.py,sha256=PcfDxadlc4yW4AEDdvN32AeUpx2Sms1jvnbX31Xu65o,957
 deepeval/test_case/conversational_test_case.py,sha256=lF0V1yCGCInQetggm2wbXx-MkuMRs2ScwqIXCSwb1Fs,7534
-deepeval/test_case/llm_test_case.py,sha256=
+deepeval/test_case/llm_test_case.py,sha256=L-dCvJ4pMPPavZTyN9ZKN30h351DWI_TunmXfHPIjig,12180
 deepeval/test_case/mcp.py,sha256=Z625NLvz0E_UJpbyfyuAi_4nsqKH6DByBf0rfKd70xU,1879
 deepeval/test_case/mllm_test_case.py,sha256=8a0YoE72geX_fLI6yk_cObSxCPddwW-DOb-5OPE1-W8,5414
 deepeval/test_case/utils.py,sha256=5lT7QmhItsQHt44-qQfspuktilcrEyvl2cS0cgUJxds,809

@@ -454,15 +454,15 @@ deepeval/tracing/offline_evals/thread.py,sha256=bcSGFcZJKnszArOLIlWvnCyt0zSmsd7X
 deepeval/tracing/offline_evals/trace.py,sha256=vTflaTKysKRiYvKA-Nx6PUJ3J6NrRLXiIdWieVcm90E,1868
 deepeval/tracing/otel/__init__.py,sha256=HQsaF5yLPwyW5qg8AOV81_nG_7pFHnatOTHi9Wx3HEk,88
 deepeval/tracing/otel/exporter.py,sha256=wPO1ITKpjueLOSNLO6nD2QL9LAd8Xcu6en8hRkB61Wo,28891
-deepeval/tracing/otel/utils.py,sha256=
+deepeval/tracing/otel/utils.py,sha256=yAXyPvTjax2HdLcvbVv9pyOVW4S7elIp3RLGuBTr_8o,15113
 deepeval/tracing/patchers.py,sha256=DAPNkhrDtoeyJIVeQDUMhTz-xGcXu00eqjQZmov8FiU,3096
 deepeval/tracing/perf_epoch_bridge.py,sha256=iyAPddB6Op7NpMtPHJ29lDm53Btz9yLaN6xSCfTRQm4,1825
-deepeval/tracing/tracing.py,sha256=
+deepeval/tracing/tracing.py,sha256=xZEyuxdGY259nQaDkGp_qO7Avriv8hrf4L15ZfeMNV8,42728
 deepeval/tracing/types.py,sha256=l_utWKerNlE5H3mOKpeUJLsvpP3cMyjH7HRANNgTmSQ,5306
-deepeval/tracing/utils.py,sha256=
+deepeval/tracing/utils.py,sha256=SLnks8apGlrV6uVnvFVl2mWYABEkvXbPXnQvq3KaU_o,7943
 deepeval/utils.py,sha256=-_o3W892u7naX4Y7a8if4mP0Rtkgtapg6Krr1ZBpj0o,17197
-deepeval-3.6.
-deepeval-3.6.
-deepeval-3.6.
-deepeval-3.6.
-deepeval-3.6.
+deepeval-3.6.3.dist-info/LICENSE.md,sha256=0ATkuLv6QgsJTBODUHC5Rak_PArA6gv2t7inJzNTP38,11352
+deepeval-3.6.3.dist-info/METADATA,sha256=BoRZ6BEBPwkypse9Xzw8gRlsezwSrDKsT5RO9C3thQc,18754
+deepeval-3.6.3.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+deepeval-3.6.3.dist-info/entry_points.txt,sha256=fVr8UphXTfJe9I2rObmUtfU3gkSrYeM0pLy-NbJYg10,94
+deepeval-3.6.3.dist-info/RECORD,,

{deepeval-3.6.1.dist-info → deepeval-3.6.3.dist-info}/LICENSE.md
File without changes

{deepeval-3.6.1.dist-info → deepeval-3.6.3.dist-info}/WHEEL
File without changes

{deepeval-3.6.1.dist-info → deepeval-3.6.3.dist-info}/entry_points.txt
File without changes