deepeval-3.6.2-py3-none-any.whl → deepeval-3.6.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deepeval/_version.py CHANGED
@@ -1 +1 @@
-__version__: str = "3.6.2"
+__version__: str = "3.6.3"
deepeval/confident/api.py CHANGED
@@ -90,6 +90,7 @@ class Endpoints(Enum):
     TRACES_ENDPOINT = "/v1/traces"
     ANNOTATIONS_ENDPOINT = "/v1/annotations"
     PROMPTS_VERSION_ID_ENDPOINT = "/v1/prompts/:alias/versions/:versionId"
+    PROMPTS_LABEL_ENDPOINT = "/v1/prompts/:alias/labels/:label"
     PROMPTS_ENDPOINT = "/v1/prompts"
     PROMPTS_VERSIONS_ENDPOINT = "/v1/prompts/:alias/versions"
     SIMULATE_ENDPOINT = "/v1/simulate"
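Note: the new route follows the same `:param` placeholder convention as the existing versionId endpoint, with `:alias` and `:label` substituted from `url_params` at request time. With illustrative values (not from the diff), pulling alias `my-prompt` at label `prod` would resolve to `GET /v1/prompts/my-prompt/labels/prod`.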
deepeval/metrics/g_eval/g_eval.py CHANGED
@@ -97,7 +97,8 @@ class GEval(BaseMetric):
                 test_case, _additional_context=_additional_context
             )
             self.score = (
-                float(g_score) / self.score_range_span
+                (float(g_score) - self.score_range[0])
+                / self.score_range_span
                 if not self.strict_mode
                 else int(g_score)
             )
@@ -140,7 +141,7 @@ class GEval(BaseMetric):
                 test_case, _additional_context=_additional_context
            )
             self.score = (
-                float(g_score) / self.score_range_span
+                (float(g_score) - self.score_range[0]) / self.score_range_span
                 if not self.strict_mode
                 else int(g_score)
             )
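Note: both hunks apply the same min-max rescaling. When a custom score_range does not start at zero, dividing the raw score by the span alone pushes the normalized score outside [0, 1]; subtracting the lower bound first fixes that. A minimal sketch with a hypothetical 1-to-5 rubric (values are illustrative):

    score_range = (1, 5)                                   # hypothetical rubric bounds
    score_range_span = score_range[1] - score_range[0]     # 4

    g_score = 5.0
    old = g_score / score_range_span                       # 1.25 -> overshoots 1.0
    new = (g_score - score_range[0]) / score_range_span    # 1.0  -> stays in [0, 1]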
deepeval/metrics/tool_correctness/tool_correctness.py CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Union, Dict
+from typing import List, Dict

 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.utils import (
@@ -152,14 +152,19 @@ class ToolCorrectnessMetric(BaseMetric):

     # Calculate score
     def _calculate_score(self):
-        # Fix: handle empty expected_tools to avoid ZeroDivisionError
-        if len(self.expected_tools) == 0:
-            score = 1.0 if len(self.tools_called) == 0 else 0.0
-        elif self.should_exact_match:
+        if self.should_exact_match:
             score = self._calculate_exact_match_score()
         elif self.should_consider_ordering:
             _, weighted_length = self._compute_weighted_lcs()
-            score = weighted_length / len(self.expected_tools)
+            if (
+                len(self.tools_called) == len(self.expected_tools)
+                and len(self.expected_tools) == 0
+            ):
+                score = 1.0
+            elif len(self.expected_tools) == 0:
+                score = 0.0
+            else:
+                score = weighted_length / len(self.expected_tools)
         else:
             score = self._calculate_non_exact_match_score()
         return 0 if self.strict_mode and score < self.threshold else score
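Note: the empty-expected_tools guard now lives inside the ordering branch, where the weighted-LCS division is the only place a ZeroDivisionError can occur. A sketch of the decision table with hypothetical inputs:

    tools_called, expected_tools = [], []

    # weighted_length / len(expected_tools) would divide by zero here,
    # so the added branches short-circuit first:
    if len(tools_called) == len(expected_tools) == 0:
        score = 1.0   # nothing expected and nothing called
    elif len(expected_tools) == 0:
        score = 0.0   # nothing expected, but tools were called anyway
    print(score)      # 1.0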
@@ -294,7 +299,7 @@ class ToolCorrectnessMetric(BaseMetric):
     def is_successful(self) -> bool:
         try:
             self.success = self.score >= self.threshold
-        except:
+        except (AttributeError, TypeError):
             self.success = False
         return self.success

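Note: narrowing the bare except keeps the two realistic failure modes, an unset attribute or a None score, while no longer swallowing unrelated errors. A minimal illustration, assuming a metric whose score was never computed:

    score = None
    try:
        success = score >= 0.5          # TypeError: '>=' unsupported for NoneType
    except (AttributeError, TypeError):
        success = False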
deepeval/models/llms/amazon_bedrock_model.py CHANGED
@@ -9,7 +9,7 @@ from deepeval.models.retry_policy import (
     sdk_retries_for,
 )
 from deepeval.models import DeepEvalBaseLLM
-from deepeval.models.llms.utils import trim_and_load_json
+from deepeval.models.llms.utils import trim_and_load_json, safe_asyncio_run
 from deepeval.constants import ProviderSlug as PS

 # check aiobotocore availability
@@ -40,7 +40,6 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
         region_name: str,
         aws_access_key_id: Optional[str] = None,
         aws_secret_access_key: Optional[str] = None,
-        temperature: float = 0,
         input_token_cost: float = 0,
         output_token_cost: float = 0,
         generation_kwargs: Optional[Dict] = None,
@@ -53,13 +52,9 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
         self.region_name = region_name
         self.aws_access_key_id = aws_access_key_id
         self.aws_secret_access_key = aws_secret_access_key
-        self.temperature = temperature
         self.input_token_cost = input_token_cost
         self.output_token_cost = output_token_cost

-        if self.temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
-
         # prepare aiobotocore session, config, and async exit stack
         self._session = get_session()
         self._exit_stack = AsyncExitStack()
@@ -75,7 +70,7 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, Dict], float]:
-        return asyncio.run(self.a_generate(prompt, schema))
+        return safe_asyncio_run(self.a_generate(prompt, schema))

     @retry_bedrock
     async def a_generate(
@@ -142,34 +137,11 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
     ###############################################

     def get_converse_request_body(self, prompt: str) -> dict:
-        # Inline parameter translation with defaults
-        param_mapping = {
-            "max_tokens": "maxTokens",
-            "top_p": "topP",
-            "top_k": "topK",
-            "stop_sequences": "stopSequences",
-        }
-
-        # Start with defaults for required parameters
-        translated_kwargs = {
-            "maxTokens": self.generation_kwargs.get("max_tokens", 1000),
-            "topP": self.generation_kwargs.get("top_p", 0),
-        }
-
-        # Add any other parameters from generation_kwargs
-        for key, value in self.generation_kwargs.items():
-            if key not in [
-                "max_tokens",
-                "top_p",
-            ]:  # Skip already handled defaults
-                aws_key = param_mapping.get(key, key)
-                translated_kwargs[aws_key] = value

         return {
             "messages": [{"role": "user", "content": [{"text": prompt}]}],
             "inferenceConfig": {
-                "temperature": self.temperature,
-                **translated_kwargs,
+                **self.generation_kwargs,
             },
         }

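Note: with the snake_case translation layer and the dedicated temperature argument gone, generation_kwargs is now forwarded verbatim into the Converse API's inferenceConfig, so callers supply AWS-native camelCase keys themselves. A hypothetical construction under that assumption (the model_id parameter and value are illustrative, not shown in the diff):

    model = AmazonBedrockModel(
        model_id="anthropic.claude-3-5-sonnet-20240620-v1:0",  # illustrative
        region_name="us-east-1",
        generation_kwargs={
            "temperature": 0,   # previously a dedicated constructor argument
            "maxTokens": 1000,  # previously translated from max_tokens
            "topP": 0.9,        # previously translated from top_p
        },
    )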
deepeval/models/llms/openai_model.py CHANGED
@@ -204,7 +204,6 @@ models_requiring_temperature_1 = [
     "gpt-5-mini-2025-08-07",
     "gpt-5-nano",
     "gpt-5-nano-2025-08-07",
-    "gpt-5-chat-latest",
 ]


deepeval/models/llms/utils.py CHANGED
@@ -1,6 +1,7 @@
 from typing import Dict
 import re
 import json
+import asyncio


 def trim_and_load_json(
@@ -20,3 +21,24 @@ def trim_and_load_json(
             raise ValueError(error_str)
     except Exception as e:
         raise Exception(f"An unexpected error occurred: {str(e)}")
+
+
+def safe_asyncio_run(coro):
+    """
+    Run an async coroutine safely.
+    Falls back to run_until_complete if already in a running event loop.
+    """
+    try:
+        return asyncio.run(coro)
+    except RuntimeError:
+        try:
+            loop = asyncio.get_event_loop()
+            if loop.is_running():
+                future = asyncio.ensure_future(coro)
+                return loop.run_until_complete(future)
+            else:
+                return loop.run_until_complete(coro)
+        except Exception as inner_e:
+            raise
+    except Exception as e:
+        raise
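Note: a small usage sketch of the new helper (assumes deepeval 3.6.3 is installed; the coroutine is illustrative). In a plain script it behaves exactly like asyncio.run; the fallback path only engages when a RuntimeError signals an existing loop:

    from deepeval.models.llms.utils import safe_asyncio_run

    async def fetch_answer() -> str:
        return "42"

    print(safe_asyncio_run(fetch_answer()))  # "42"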
deepeval/prompt/api.py CHANGED
@@ -45,6 +45,8 @@ class PromptVersionsHttpResponse(BaseModel):

 class PromptHttpResponse(BaseModel):
     id: str
+    version: str
+    label: Optional[str] = None
     text: Optional[str] = None
     messages: Optional[List[PromptMessage]] = None
     interpolation_type: PromptInterpolationType = Field(
deepeval/prompt/prompt.py CHANGED
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Optional, List, Dict
+from typing import Literal, Optional, List, Dict
 from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
 from rich.console import Console
 import time
@@ -7,6 +7,7 @@ import json
 import os
 from pydantic import BaseModel
 import asyncio
+import portalocker

 from deepeval.prompt.api import (
     PromptHttpResponse,
@@ -25,6 +26,8 @@ from deepeval.utils import (
 )

 CACHE_FILE_NAME = f"{HIDDEN_DIR}/.deepeval-prompt-cache.json"
+VERSION_CACHE_KEY = "version"
+LABEL_CACHE_KEY = "label"


 class CustomEncoder(json.JSONEncoder):
@@ -39,6 +42,7 @@ class CustomEncoder(json.JSONEncoder):
 class CachedPrompt(BaseModel):
     alias: str
     version: str
+    label: Optional[str] = None
     template: Optional[str]
     messages_template: Optional[List[PromptMessage]]
     prompt_version_id: str
@@ -50,6 +54,7 @@ class CachedPrompt(BaseModel):


 class Prompt:
+    label: Optional[str] = None
     _prompt_version_id: Optional[str] = None
     _type: Optional[PromptType] = None
     _interpolation_type: Optional[PromptInterpolationType] = None
@@ -73,8 +78,8 @@ class Prompt:
         self._text_template = template
         self._messages_template = messages_template
         self._version = None
-        self._polling_tasks: Dict[str, asyncio.Task] = {}
-        self._refresh_map: Dict[str, int] = {}
+        self._polling_tasks: Dict[str, Dict[str, asyncio.Task]] = {}
+        self._refresh_map: Dict[str, Dict[str, int]] = {}
         if template:
             self._type = PromptType.TEXT
         elif messages_template:
@@ -138,87 +143,173 @@ class Prompt:
         return versions.text_versions or versions.messages_versions or []

     def _read_from_cache(
-        self, alias: str, version: Optional[str] = None
+        self,
+        alias: str,
+        version: Optional[str] = None,
+        label: Optional[str] = None,
     ) -> Optional[CachedPrompt]:
         if not os.path.exists(CACHE_FILE_NAME):
-            raise Exception("No Prompt cache file found")
+            return None

         try:
-            with open(CACHE_FILE_NAME, "r") as f:
+            # Use shared lock for reading to allow concurrent reads
+            with portalocker.Lock(
+                CACHE_FILE_NAME,
+                mode="r",
+                flags=portalocker.LOCK_SH | portalocker.LOCK_NB,
+            ) as f:
                 cache_data = json.load(f)

             if alias in cache_data:
                 if version:
-                    if version in cache_data[alias]:
-                        return CachedPrompt(**cache_data[alias][version])
-                    else:
-                        raise Exception(
-                            f"Unable to find Prompt version: '{version}' for alias: '{alias}' in cache"
+                    if (
+                        VERSION_CACHE_KEY in cache_data[alias]
+                        and version in cache_data[alias][VERSION_CACHE_KEY]
+                    ):
+                        return CachedPrompt(
+                            **cache_data[alias][VERSION_CACHE_KEY][version]
                         )
-                else:
-                    raise Exception(
-                        f"Unable to load Prompt with alias: '{alias}' from cache when no version is specified "
-                    )
-            else:
-                raise Exception(
-                    f"Unable to find Prompt with alias: '{alias}' in cache"
-                )
-        except Exception as e:
-            raise Exception(f"Error reading Prompt cache from disk: {e}")
+                elif label:
+                    if (
+                        LABEL_CACHE_KEY in cache_data[alias]
+                        and label in cache_data[alias][LABEL_CACHE_KEY]
+                    ):
+                        return CachedPrompt(
+                            **cache_data[alias][LABEL_CACHE_KEY][label]
+                        )
+            return None
+        except (portalocker.exceptions.LockException, Exception):
+            # If cache is locked, corrupted or unreadable, return None and let it fetch from API
+            return None

     def _write_to_cache(
         self,
-        version: Optional[str] = None,
+        cache_key: Literal[VERSION_CACHE_KEY, LABEL_CACHE_KEY],
+        version: str,
+        label: Optional[str] = None,
         text_template: Optional[str] = None,
         messages_template: Optional[List[PromptMessage]] = None,
         prompt_version_id: Optional[str] = None,
         type: Optional[PromptType] = None,
         interpolation_type: Optional[PromptInterpolationType] = None,
     ):
-        if not self.alias or not version:
+        if not self.alias:
             return

-        cache_data = {}
-        if os.path.exists(CACHE_FILE_NAME):
-            try:
-                with open(CACHE_FILE_NAME, "r") as f:
-                    cache_data = json.load(f)
-            except Exception:
-                cache_data = {}
-
-        # Ensure the cache structure is initialized properly
-        if self.alias not in cache_data:
-            cache_data[self.alias] = {}
-
-        # Cache the prompt
-        cache_data[self.alias][version] = {
-            "alias": self.alias,
-            "version": version,
-            "template": text_template,
-            "messages_template": messages_template,
-            "prompt_version_id": prompt_version_id,
-            "type": type,
-            "interpolation_type": interpolation_type,
-        }
-
         # Ensure directory exists
         os.makedirs(HIDDEN_DIR, exist_ok=True)

-        # Write back to cache file
-        with open(CACHE_FILE_NAME, "w") as f:
-            json.dump(cache_data, f, cls=CustomEncoder)
+        try:
+            # Use r+ mode if file exists, w mode if it doesn't
+            mode = "r+" if os.path.exists(CACHE_FILE_NAME) else "w"
+
+            with portalocker.Lock(
+                CACHE_FILE_NAME,
+                mode=mode,
+                flags=portalocker.LOCK_EX,
+            ) as f:
+                # Read existing cache data if file exists and has content
+                cache_data = {}
+                if mode == "r+":
+                    try:
+                        f.seek(0)
+                        content = f.read()
+                        if content:
+                            cache_data = json.loads(content)
+                    except (json.JSONDecodeError, Exception):
+                        cache_data = {}
+
+                # Ensure the cache structure is initialized properly
+                if self.alias not in cache_data:
+                    cache_data[self.alias] = {}
+
+                if cache_key not in cache_data[self.alias]:
+                    cache_data[self.alias][cache_key] = {}
+
+                # Cache the prompt
+                cached_entry = {
+                    "alias": self.alias,
+                    "version": version,
+                    "label": label,
+                    "template": text_template,
+                    "messages_template": messages_template,
+                    "prompt_version_id": prompt_version_id,
+                    "type": type,
+                    "interpolation_type": interpolation_type,
+                }
+
+                if cache_key == VERSION_CACHE_KEY:
+                    cache_data[self.alias][cache_key][version] = cached_entry
+                else:
+                    cache_data[self.alias][cache_key][label] = cached_entry
+
+                # Write back to cache file
+                f.seek(0)
+                f.truncate()
+                json.dump(cache_data, f, cls=CustomEncoder)
+        except portalocker.exceptions.LockException:
+            # If we can't acquire the lock, silently skip caching
+            pass
+        except Exception:
+            # If any other error occurs during caching, silently skip
+            pass
+
+    def _load_from_cache_with_progress(
+        self,
+        progress: Progress,
+        task_id: int,
+        start_time: float,
+        version: Optional[str] = None,
+        label: Optional[str] = None,
+    ):
+        """
+        Load prompt from cache and update progress bar.
+        Raises if unable to load from cache.
+        """
+        cached_prompt = self._read_from_cache(
+            self.alias, version=version, label=label
+        )
+        if not cached_prompt:
+            raise ValueError("Unable to fetch prompt and load from cache")
+
+        self.version = cached_prompt.version
+        self.label = cached_prompt.label
+        self._text_template = cached_prompt.template
+        self._messages_template = cached_prompt.messages_template
+        self._prompt_version_id = cached_prompt.prompt_version_id
+        self._type = PromptType(cached_prompt.type)
+        self._interpolation_type = PromptInterpolationType(
+            cached_prompt.interpolation_type
+        )
+
+        end_time = time.perf_counter()
+        time_taken = format(end_time - start_time, ".2f")
+        progress.update(
+            task_id,
+            description=f"{progress.tasks[task_id].description}[rgb(25,227,160)]Loaded from cache! ({time_taken}s)",
+        )

     def pull(
         self,
         version: Optional[str] = None,
+        label: Optional[str] = None,
         fallback_to_cache: bool = True,
         write_to_cache: bool = True,
         default_to_cache: bool = True,
         refresh: Optional[int] = 60,
     ):
+        should_write_on_first_fetch = False
         if refresh:
             default_to_cache = True
-            write_to_cache = False
+            # Check if we need to bootstrap the cache
+            cached_prompt = self._read_from_cache(
+                self.alias, version=version, label=label
+            )
+            if cached_prompt is None:
+                # No cache exists, so we should write after fetching to bootstrap
+                should_write_on_first_fetch = True
+            write_to_cache = False  # Polling will handle subsequent writes
+
         if self.alias is None:
             raise TypeError(
                 "Unable to pull prompt from Confident AI when no alias is provided."
@@ -227,15 +318,20 @@ class Prompt:
         # Manage background prompt polling
         loop = get_or_create_general_event_loop()
         if loop.is_running():
-            loop.create_task(self.create_polling_task(version, refresh))
+            loop.create_task(self.create_polling_task(version, label, refresh))
         else:
-            loop.run_until_complete(self.create_polling_task(version, refresh))
+            loop.run_until_complete(
+                self.create_polling_task(version, label, refresh)
+            )

         if default_to_cache:
             try:
-                cached_prompt = self._read_from_cache(self.alias, version)
+                cached_prompt = self._read_from_cache(
+                    self.alias, version=version, label=label
+                )
                 if cached_prompt:
                     self.version = cached_prompt.version
+                    self.label = cached_prompt.label
                     self._text_template = cached_prompt.template
                     self._messages_template = cached_prompt.messages_template
                     self._prompt_version_id = cached_prompt.prompt_version_id
@@ -254,58 +350,60 @@ class Prompt:
             TextColumn("[progress.description]{task.description}"),
             transient=False,
         ) as progress:
+            HINT_TEXT = (
+                f"version='{version or 'latest'}'"
+                if not label
+                else f"label='{label}'"
+            )
             task_id = progress.add_task(
-                f"Pulling [rgb(106,0,255)]'{self.alias}' (version='{version or 'latest'}')[/rgb(106,0,255)] from Confident AI...",
+                f"Pulling [rgb(106,0,255)]'{self.alias}' ({HINT_TEXT})[/rgb(106,0,255)] from Confident AI...",
                 total=100,
             )
+
             start_time = time.perf_counter()
             try:
-                data, _ = api.send_request(
-                    method=HttpMethods.GET,
-                    endpoint=Endpoints.PROMPTS_VERSION_ID_ENDPOINT,
-                    url_params={
-                        "alias": self.alias,
-                        "versionId": version or "latest",
-                    },
-                )
+                if label:
+                    data, _ = api.send_request(
+                        method=HttpMethods.GET,
+                        endpoint=Endpoints.PROMPTS_LABEL_ENDPOINT,
+                        url_params={
+                            "alias": self.alias,
+                            "label": label,
+                        },
+                    )
+                else:
+                    data, _ = api.send_request(
+                        method=HttpMethods.GET,
+                        endpoint=Endpoints.PROMPTS_VERSION_ID_ENDPOINT,
+                        url_params={
+                            "alias": self.alias,
+                            "versionId": version or "latest",
+                        },
+                    )
+
                 response = PromptHttpResponse(
                     id=data["id"],
+                    version=data.get("version", None),
+                    label=data.get("label", None),
                     text=data.get("text", None),
                     messages=data.get("messages", None),
                     type=data["type"],
                     interpolation_type=data["interpolationType"],
                 )
-            except:
-                try:
-                    if fallback_to_cache:
-                        cached_prompt = self._read_from_cache(
-                            self.alias, version
-                        )
-                        if cached_prompt:
-                            self.version = cached_prompt.version
-                            self._text_template = cached_prompt.template
-                            self._messages_template = (
-                                cached_prompt.messages_template
-                            )
-                            self._prompt_version_id = (
-                                cached_prompt.prompt_version_id
-                            )
-                            self._type = PromptType(cached_prompt.type)
-                            self._interpolation_type = PromptInterpolationType(
-                                cached_prompt.interpolation_type
-                            )
-
-                            end_time = time.perf_counter()
-                            time_taken = format(end_time - start_time, ".2f")
-                            progress.update(
-                                task_id,
-                                description=f"{progress.tasks[task_id].description}[rgb(25,227,160)]Loaded from cache! ({time_taken}s)",
-                            )
-                            return
-                except:
-                    raise
-
-            self.version = version or "latest"
+            except Exception:
+                if fallback_to_cache:
+                    self._load_from_cache_with_progress(
+                        progress,
+                        task_id,
+                        start_time,
+                        version=version,
+                        label=label,
+                    )
+                    return
+                raise
+
+            self.version = response.version
+            self.label = response.label
             self._text_template = response.text
             self._messages_template = response.messages
             self._prompt_version_id = response.id
@@ -318,9 +416,12 @@ class Prompt:
                 task_id,
                 description=f"{progress.tasks[task_id].description}[rgb(25,227,160)]Done! ({time_taken}s)",
             )
-            if write_to_cache:
+            # Write to cache if explicitly requested OR if we need to bootstrap cache for refresh mode
+            if write_to_cache or should_write_on_first_fetch:
                 self._write_to_cache(
-                    version=version or "latest",
+                    cache_key=LABEL_CACHE_KEY if label else VERSION_CACHE_KEY,
+                    version=response.version,
+                    label=response.label,
                     text_template=response.text,
                     messages_template=response.messages,
                     prompt_version_id=response.id,
@@ -380,55 +481,114 @@ class Prompt:
     async def create_polling_task(
         self,
         version: Optional[str],
+        label: Optional[str],
         refresh: Optional[int] = 60,
+        default_to_cache: bool = True,
     ):
-        if version is None:
+        if version is None and label is None:
             return

         # If polling task doesn't exist, start it
-        polling_task: Optional[asyncio.Task] = self._polling_tasks.get(version)
+        CACHE_KEY = LABEL_CACHE_KEY if label else VERSION_CACHE_KEY
+        cache_value = label if label else version
+
+        # Initialize nested dicts if they don't exist
+        if CACHE_KEY not in self._polling_tasks:
+            self._polling_tasks[CACHE_KEY] = {}
+        if CACHE_KEY not in self._refresh_map:
+            self._refresh_map[CACHE_KEY] = {}
+
+        polling_task: Optional[asyncio.Task] = self._polling_tasks[
+            CACHE_KEY
+        ].get(cache_value)
+
         if refresh:
-            self._refresh_map[version] = refresh
+            self._refresh_map[CACHE_KEY][cache_value] = refresh
             if not polling_task:
-                self._polling_tasks[version] = asyncio.create_task(
-                    self.poll(version)
+                self._polling_tasks[CACHE_KEY][cache_value] = (
+                    asyncio.create_task(
+                        self.poll(version, label, default_to_cache)
+                    )
                 )

         # If invalid `refresh`, stop the task
         else:
             if polling_task:
                 polling_task.cancel()
-                self._polling_tasks.pop(version)
-                self._refresh_map.pop(version)
+                if cache_value in self._polling_tasks[CACHE_KEY]:
+                    self._polling_tasks[CACHE_KEY].pop(cache_value)
+                if cache_value in self._refresh_map[CACHE_KEY]:
+                    self._refresh_map[CACHE_KEY].pop(cache_value)

-    async def poll(self, version: Optional[str] = None):
-        api = Api()
+    async def poll(
+        self,
+        version: Optional[str] = None,
+        label: Optional[str] = None,
+        default_to_cache: bool = True,
+    ):
         while True:
-            try:
-                data, _ = api.send_request(
-                    method=HttpMethods.GET,
-                    endpoint=Endpoints.PROMPTS_VERSION_ID_ENDPOINT,
-                    url_params={
-                        "alias": self.alias,
-                        "versionId": version or "latest",
-                    },
+            if default_to_cache:
+                cached_prompt = self._read_from_cache(
+                    self.alias, version=version, label=label
                 )
+                if cached_prompt:
+                    self.version = cached_prompt.version
+                    self.label = cached_prompt.label
+                    self._text_template = cached_prompt.template
+                    self._messages_template = cached_prompt.messages_template
+                    self._prompt_version_id = cached_prompt.prompt_version_id
+                    self._type = PromptType(cached_prompt.type)
+                    self._interpolation_type = PromptInterpolationType(
+                        cached_prompt.interpolation_type
+                    )
+                    return
+
+            api = Api()
+            try:
+                if label:
+                    data, _ = api.send_request(
+                        method=HttpMethods.GET,
+                        endpoint=Endpoints.PROMPTS_LABEL_ENDPOINT,
+                        url_params={
+                            "alias": self.alias,
+                            "label": label,
+                        },
+                    )
+                else:
+                    data, _ = api.send_request(
+                        method=HttpMethods.GET,
+                        endpoint=Endpoints.PROMPTS_VERSION_ID_ENDPOINT,
+                        url_params={
+                            "alias": self.alias,
+                            "versionId": version or "latest",
+                        },
+                    )
+
                 response = PromptHttpResponse(
                     id=data["id"],
+                    version=data.get("version", None),
+                    label=data.get("label", None),
                     text=data.get("text", None),
                     messages=data.get("messages", None),
                     type=data["type"],
                     interpolation_type=data["interpolationType"],
                 )
-                self._write_to_cache(
-                    version=version or "latest",
-                    text_template=response.text,
-                    messages_template=response.messages,
-                    prompt_version_id=response.id,
-                    type=response.type,
-                    interpolation_type=response.interpolation_type,
-                )
+                if default_to_cache:
+                    self._write_to_cache(
+                        cache_key=(
+                            LABEL_CACHE_KEY if label else VERSION_CACHE_KEY
+                        ),
+                        version=response.version,
+                        label=response.label,
+                        text_template=response.text,
+                        messages_template=response.messages,
+                        prompt_version_id=response.id,
+                        type=response.type,
+                        interpolation_type=response.interpolation_type,
+                    )
             except Exception as e:
                 pass

-            await asyncio.sleep(self._refresh_map[version])
+            CACHE_KEY = LABEL_CACHE_KEY if label else VERSION_CACHE_KEY
+            cache_value = label if label else version
+            await asyncio.sleep(self._refresh_map[CACHE_KEY][cache_value])
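Note: taken together, these hunks let a prompt be pinned to a label instead of a version. A hypothetical usage sketch (alias and label values are illustrative, and the constructor form is assumed from the surrounding code):

    from deepeval.prompt import Prompt

    prompt = Prompt(alias="my-prompt")
    prompt.pull(label="prod")            # resolves via PROMPTS_LABEL_ENDPOINT
    print(prompt.version, prompt.label)  # now populated from the HTTP response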
deepeval/tracing/otel/utils.py CHANGED
@@ -383,53 +383,70 @@ def post_test_run(traces: List[Trace], test_run_id: Optional[str]):
     # return test_run_manager.post_test_run(test_run) TODO: add after test run with metric collection is implemented


+def _normalize_pydantic_ai_messages(span: ReadableSpan) -> Optional[list]:
+    try:
+        raw = span.attributes.get("pydantic_ai.all_messages")
+        if not raw:
+            return None
+
+        messages = raw
+        if isinstance(messages, str):
+            messages = json.loads(messages)
+        elif isinstance(messages, tuple):
+            messages = list(messages)
+
+        if isinstance(messages, list):
+            normalized = []
+            for m in messages:
+                if isinstance(m, str):
+                    try:
+                        m = json.loads(m)
+                    except Exception:
+                        pass
+                normalized.append(m)
+            return normalized
+    except Exception:
+        pass
+
+    return None
+
+
 def check_pydantic_ai_agent_input_output(
     span: ReadableSpan,
 ) -> Tuple[Optional[Any], Optional[Any]]:
     input_val: Optional[Any] = None
     output_val: Optional[Any] = None

+    # Get normalized messages once
+    normalized = _normalize_pydantic_ai_messages(span)
+
     # Input (pydantic_ai.all_messages) - slice up to and including the first 'user' message
-    try:
-        raw = span.attributes.get("pydantic_ai.all_messages")
-        if raw:
-            messages = raw
-            if isinstance(messages, str):
-                messages = json.loads(messages)
-            elif isinstance(messages, tuple):
-                messages = list(messages)
-
-            if isinstance(messages, list):
-                normalized = []
-                for m in messages:
-                    if isinstance(m, str):
-                        try:
-                            m = json.loads(m)
-                        except Exception:
-                            pass
-                    normalized.append(m)
-
-                first_user_idx = None
-                for i, m in enumerate(normalized):
-                    role = None
-                    if isinstance(m, dict):
-                        role = m.get("role") or m.get("author")
-                    if role == "user":
-                        first_user_idx = i
-                        break
-
-                input_val = (
-                    normalized
-                    if first_user_idx is None
-                    else normalized[: first_user_idx + 1]
-                )
-    except Exception:
-        pass
+    if normalized:
+        try:
+            first_user_idx = None
+            for i, m in enumerate(normalized):
+                role = None
+                if isinstance(m, dict):
+                    role = m.get("role") or m.get("author")
+                if role == "user":
+                    first_user_idx = i
+                    break
+
+            input_val = (
+                normalized
+                if first_user_idx is None
+                else normalized[: first_user_idx + 1]
+            )
+        except Exception:
+            pass

     # Output (agent final_result)
     try:
         if span.attributes.get("confident.span.type") == "agent":
             output_val = span.attributes.get("final_result")
+            if not output_val and normalized:
+                # Extract the last message if no final_result is available
+                output_val = normalized[-1]
     except Exception:
         pass

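Note: the extracted helper makes the attribute handling reusable for both input slicing and the new output fallback. OpenTelemetry span attributes may arrive as a JSON string or as a tuple of JSON strings; both forms now normalize to a list of dicts. An illustrative input/output pair (values are hypothetical):

    # A span attribute of ('{"role": "user", "content": "hi"}',) or the
    # equivalent single JSON string both normalize to:
    # [{"role": "user", "content": "hi"}]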
deepeval-3.6.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepeval
-Version: 3.6.2
+Version: 3.6.3
 Summary: The LLM Evaluation Framework
 Home-page: https://github.com/confident-ai/deepeval
 License: Apache-2.0
deepeval-3.6.3.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
 deepeval/__init__.py,sha256=6fsb813LD_jNhqR-xZnSdE5E-KsBbC3tc4oIg5ZMgTw,2115
-deepeval/_version.py,sha256=3BMVt8jAt3lUkzkZWaFVDhhP9a-3lhvDGzjhGKNfjCo,27
+deepeval/_version.py,sha256=1BsEnmEpD1mtVjCYoXBeguVgrKPAi3TRpS_a7ndu4XU,27
 deepeval/annotation/__init__.py,sha256=ZFhUVNNuH_YgQSZJ-m5E9iUb9TkAkEV33a6ouMDZ8EI,111
 deepeval/annotation/annotation.py,sha256=3j3-syeJepAcEj3u3e4T_BeRDzNr7yXGDIoNQGMKpwQ,2298
 deepeval/annotation/api.py,sha256=EYN33ACVzVxsFleRYm60KB4Exvff3rPJKt1VBuuX970,2147
@@ -138,7 +138,7 @@ deepeval/cli/test.py,sha256=kSIFMRTAfVzBJ4OitwvT829-ylV7UzPMP57P2DePS-Q,5482
 deepeval/cli/types.py,sha256=_7KdthstHNc-JKCWrfpDQCf_j8h9PMxh0qJCHmVXJr0,310
 deepeval/cli/utils.py,sha256=F4-yuONzk4ojDoSLjI9RYERB7HOD412iZ2lNlSCq4wk,5601
 deepeval/confident/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-deepeval/confident/api.py,sha256=bOC71TaVAEgoXFtJ9yMo0-atmUUdBuvaclMGczMcR6o,8455
+deepeval/confident/api.py,sha256=2ZhrQOtfxcnQSyY6OxrjY17y1yn-NB7pfIiJa20B1Pk,8519
 deepeval/confident/types.py,sha256=-slFhDof_1maMgpLxqDRZv6kz6ZVY2hP_0uj_aveJKU,533
 deepeval/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deepeval/config/settings.py,sha256=gRRi6nXEUKse13xAShU9MA18zo14vpIgl_R0xJ_0vnM,21314
@@ -240,7 +240,7 @@ deepeval/metrics/faithfulness/faithfulness.py,sha256=bYVhHI7Tr7xH0x-7F2LijxRuCCE
 deepeval/metrics/faithfulness/schema.py,sha256=2dU9dwwmqpGJcWvY2webERWIfH_tn02xgLghHkAY_eM,437
 deepeval/metrics/faithfulness/template.py,sha256=RuZ0LFm4BjZ8lhVrKPgU3ecHszwkF0fe5-BxAkaP5AA,5839
 deepeval/metrics/g_eval/__init__.py,sha256=HAhsQFVq9LIpZXPN00Jc_WrMXrh47NIT86VnUpWM4_4,102
-deepeval/metrics/g_eval/g_eval.py,sha256=JI3rTaEClYgiL9oLaVFh7sunqGoXI7qBeBgi9RkSwDs,14327
+deepeval/metrics/g_eval/g_eval.py,sha256=CaW7VHPW-SyXt18IE1rSatgagY238s3It-j6SLRI4H4,14395
 deepeval/metrics/g_eval/schema.py,sha256=V629txuDrr_2IEKEsgJVYYZb_pkdfcltQV9ZjvxK5co,287
 deepeval/metrics/g_eval/template.py,sha256=mHj4-mr_HQwbCjpHg7lM_6UesoSatL3g8UGGQAOdT0U,4509
 deepeval/metrics/g_eval/utils.py,sha256=uUT86jRXVYvLDzcnZvvfWssDyGoBHb66nWcJSg4i1u4,8784
@@ -348,7 +348,7 @@ deepeval/metrics/task_completion/schema.py,sha256=JfnZkbCh7skWvrESy65GEo6Rvo0FDJ
 deepeval/metrics/task_completion/task_completion.py,sha256=RKFkXCVOhO70I8A16zv5BCaV3QVKldNxawJ0T93U_Zc,8978
 deepeval/metrics/task_completion/template.py,sha256=4xjTBcGrPQxInbf8iwJOZyok9SQex1aCkbxKmfkXoA4,10437
 deepeval/metrics/tool_correctness/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deepeval/metrics/tool_correctness/tool_correctness.py,sha256=4dS8o5pD2o9W2uDb-lFgulHpLI5kFhAlguWlffIreUU,11993
+deepeval/metrics/tool_correctness/tool_correctness.py,sha256=j5wB9mJp7BLbn3bTZd6LlIeub1kXxXGaDVWrzyvBFo4,12111
 deepeval/metrics/toxicity/__init__.py,sha256=1lgt8BKxfBDd7bfSLu_5kMzmsr9b2_ahPK9oq5zLkMk,39
 deepeval/metrics/toxicity/schema.py,sha256=7uUdzXqTvIIz5nfahlllo_fzVRXg7UeMeXn7Hl32pKY,459
 deepeval/metrics/toxicity/template.py,sha256=zl4y4Tg9gXkxKJ8aXVwj0cJ94pvfYuP7MTeV3dvB5yQ,5045
@@ -370,7 +370,7 @@ deepeval/models/embedding_models/ollama_embedding_model.py,sha256=w3etdIdWvYfVIE
 deepeval/models/embedding_models/openai_embedding_model.py,sha256=Z1--e3CnNNmwryqmUMxBCaTURjtgKWHqADuUeCqFlSc,3545
 deepeval/models/hallucination_model.py,sha256=ABi978VKLE_jNHbDzM96kJ08EsZ5ZlvOlJHA_ptSkfQ,1003
 deepeval/models/llms/__init__.py,sha256=qmvv7wnmTDvys2uUTwQRo-_3DlFV3fGLiewPeQYRsAI,670
-deepeval/models/llms/amazon_bedrock_model.py,sha256=xaNV7BnqcsH31ghIKBcacKzetORlFRGHtuBlfr8vbnQ,6183
+deepeval/models/llms/amazon_bedrock_model.py,sha256=3yiUUGU_d_YK7Usq8v5iqG3yHa5VnqeDOoCLG_p8rtc,5185
 deepeval/models/llms/anthropic_model.py,sha256=5gYRNkYUD7Zl3U0SibBG2YGCQsD6DdTsaBhqdaJlKIw,6072
 deepeval/models/llms/azure_model.py,sha256=dqINcfoJNqdd9zh5iTPwQ_ToGMOF7iH6YUB-UWRSOlc,10730
 deepeval/models/llms/deepseek_model.py,sha256=EqBJkKa7rXppCmlnIt_D-Z_r9fbsOUsOAVvN2jWA-Hk,6404
@@ -380,8 +380,8 @@ deepeval/models/llms/kimi_model.py,sha256=ldTefdSVitZYJJQ-_ZsP87iiT5iZ4QCVdfi-Yz
 deepeval/models/llms/litellm_model.py,sha256=iu4-_JCpd9LdEa-eCWseD2iLTA-r7OSgYGWQ0IxB4eA,11527
 deepeval/models/llms/local_model.py,sha256=hEyKVA6pkQm9dICUKsMNgjVI3w6gnyMdmBt_EylkWDk,4473
 deepeval/models/llms/ollama_model.py,sha256=xPO4d4jMY-cQAyHAcMuFvWS8JMWwCUbKP9CMi838Nuc,3307
-deepeval/models/llms/openai_model.py,sha256=F02N8BgbiEXH7F6y-a6DkjVcBXFEzr87SEB2gVn4xlU,17192
-deepeval/models/llms/utils.py,sha256=ZMZ02kjXAAleq0bIEyjj-gZwe6Gp0b0mK8YMuid2-20,722
+deepeval/models/llms/openai_model.py,sha256=mUvQ8a9FVk4lrdZyS_QRZTK4imufyaCNjZFPeqbc0AM,17167
+deepeval/models/llms/utils.py,sha256=gFM_8eIvdSwN_D4Yqp-j7PkfoiRn_bgu7tlCHol3A6c,1324
 deepeval/models/mlllms/__init__.py,sha256=19nN6kUB5XI0nUWUQX0aD9GBUMM8WWGvsDgKjuT4EF4,144
 deepeval/models/mlllms/gemini_model.py,sha256=7tHIWD4w_fBz3L7jkKWygn1QpBPk9nl2Kw-yb0Jc3PI,10167
 deepeval/models/mlllms/ollama_model.py,sha256=_YtYtw8oIMVVI-CFsDicsdeEJUPhw_9ArPxB_1olsJA,4798
@@ -404,8 +404,8 @@ deepeval/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 deepeval/plugins/plugin.py,sha256=_dwsdx4Dg9DbXxK3f7zJY4QWTJQWc7QE1HmIg2Zjjag,1515
 deepeval/progress_context.py,sha256=ZSKpxrE9sdgt9G3REKnVeXAv7GJXHHVGgLynpG1Pudw,3557
 deepeval/prompt/__init__.py,sha256=M99QTWdxOfiNeySGCSqN873Q80PPxqRvjLq4_Mw-X1w,49
-deepeval/prompt/api.py,sha256=kR3MkaHuU2wYILKVnvnXhQWxWp0XgtcWX-kIjpMJRl8,1728
-deepeval/prompt/prompt.py,sha256=192W5zFBx08nELxRHHDQscMM3psj8OUFV_JR85BZv8Q,15823
+deepeval/prompt/api.py,sha256=665mLKiq8irXWV8kM9P_qFJipdCYZUNQFwW8AkA3itM,1777
+deepeval/prompt/prompt.py,sha256=w2BmKtSzXxobjSlBQqUjdAB0Zwe6IYaLjLg7KQvVDXE,21999
 deepeval/prompt/utils.py,sha256=Ermw9P-1-T5wQ5uYuj5yWgdj7pVB_JLw8D37Qvmh9ok,1938
 deepeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deepeval/red_teaming/README.md,sha256=BY5rAdpp3-sMMToEKwq0Nsd9ivkGDzPE16DeDb8GY7U,154
@@ -454,15 +454,15 @@ deepeval/tracing/offline_evals/thread.py,sha256=bcSGFcZJKnszArOLIlWvnCyt0zSmsd7X
 deepeval/tracing/offline_evals/trace.py,sha256=vTflaTKysKRiYvKA-Nx6PUJ3J6NrRLXiIdWieVcm90E,1868
 deepeval/tracing/otel/__init__.py,sha256=HQsaF5yLPwyW5qg8AOV81_nG_7pFHnatOTHi9Wx3HEk,88
 deepeval/tracing/otel/exporter.py,sha256=wPO1ITKpjueLOSNLO6nD2QL9LAd8Xcu6en8hRkB61Wo,28891
-deepeval/tracing/otel/utils.py,sha256=4FqCwOi-iYhuQ3GhAkbbmXbfhvSLGj9DAdfPCrUIccs,14738
+deepeval/tracing/otel/utils.py,sha256=yAXyPvTjax2HdLcvbVv9pyOVW4S7elIp3RLGuBTr_8o,15113
 deepeval/tracing/patchers.py,sha256=DAPNkhrDtoeyJIVeQDUMhTz-xGcXu00eqjQZmov8FiU,3096
 deepeval/tracing/perf_epoch_bridge.py,sha256=iyAPddB6Op7NpMtPHJ29lDm53Btz9yLaN6xSCfTRQm4,1825
 deepeval/tracing/tracing.py,sha256=xZEyuxdGY259nQaDkGp_qO7Avriv8hrf4L15ZfeMNV8,42728
 deepeval/tracing/types.py,sha256=l_utWKerNlE5H3mOKpeUJLsvpP3cMyjH7HRANNgTmSQ,5306
 deepeval/tracing/utils.py,sha256=SLnks8apGlrV6uVnvFVl2mWYABEkvXbPXnQvq3KaU_o,7943
 deepeval/utils.py,sha256=-_o3W892u7naX4Y7a8if4mP0Rtkgtapg6Krr1ZBpj0o,17197
-deepeval-3.6.2.dist-info/LICENSE.md,sha256=0ATkuLv6QgsJTBODUHC5Rak_PArA6gv2t7inJzNTP38,11352
-deepeval-3.6.2.dist-info/METADATA,sha256=TZ7FbJUYYZ1w2P-qmLZdIHB0zv4TnZ4VeLBgN9Bq6Yo,18754
-deepeval-3.6.2.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
-deepeval-3.6.2.dist-info/entry_points.txt,sha256=fVr8UphXTfJe9I2rObmUtfU3gkSrYeM0pLy-NbJYg10,94
-deepeval-3.6.2.dist-info/RECORD,,
+deepeval-3.6.3.dist-info/LICENSE.md,sha256=0ATkuLv6QgsJTBODUHC5Rak_PArA6gv2t7inJzNTP38,11352
+deepeval-3.6.3.dist-info/METADATA,sha256=BoRZ6BEBPwkypse9Xzw8gRlsezwSrDKsT5RO9C3thQc,18754
+deepeval-3.6.3.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+deepeval-3.6.3.dist-info/entry_points.txt,sha256=fVr8UphXTfJe9I2rObmUtfU3gkSrYeM0pLy-NbJYg10,94
+deepeval-3.6.3.dist-info/RECORD,,