deepeval-3.6.1-py3-none-any.whl → deepeval-3.6.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deepeval/_version.py CHANGED
@@ -1 +1 @@
- __version__: str = "3.6.1"
+ __version__: str = "3.6.2"
deepeval/evaluate/execute.py CHANGED
@@ -836,7 +836,13 @@ def execute_agentic_test_cases(
  ):
  if asyncio.iscoroutinefunction(observed_callback):
  loop = get_or_create_event_loop()
- loop.run_until_complete(observed_callback(golden.input))
+ coro = observed_callback(golden.input)
+ loop.run_until_complete(
+ asyncio.wait_for(
+ coro,
+ timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+ )
+ )
  else:
  observed_callback(golden.input)
  current_trace: Trace = current_trace_context.get()
@@ -1190,7 +1196,10 @@ async def _a_execute_agentic_test_case(
  _pbar_callback_id=pbar_tags_id,
  ):
  if asyncio.iscoroutinefunction(observed_callback):
- await observed_callback(golden.input)
+ await asyncio.wait_for(
+ observed_callback(golden.input),
+ timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+ )
  else:
  observed_callback(golden.input)
  current_trace: Trace = current_trace_context.get()
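Both hunks above wrap the user's observed callback in `asyncio.wait_for`, so a hung agent call can no longer stall the whole evaluation run. A minimal sketch of the pattern, using an illustrative `PER_TASK_TIMEOUT_SECONDS` constant in place of deepeval's `settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS`:

```python
import asyncio

# Illustrative stand-in for settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS.
PER_TASK_TIMEOUT_SECONDS = 30.0


async def observed_callback(user_input: str) -> str:
    # Stand-in for a user-supplied async agent callback.
    await asyncio.sleep(0.1)
    return f"echo: {user_input}"


def run_callback_with_timeout(user_input: str) -> str:
    loop = asyncio.new_event_loop()
    try:
        coro = observed_callback(user_input)
        # wait_for cancels the coroutine and raises asyncio.TimeoutError
        # if it has not finished within the timeout.
        return loop.run_until_complete(
            asyncio.wait_for(coro, timeout=PER_TASK_TIMEOUT_SECONDS)
        )
    finally:
        loop.close()


if __name__ == "__main__":
    print(run_callback_with_timeout("hello"))
```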
deepeval/metrics/hallucination/hallucination.py CHANGED
@@ -30,7 +30,7 @@ class HallucinationMetric(BaseMetric):
  threshold: float = 0.5,
  model: Optional[Union[str, DeepEvalBaseLLM]] = None,
  include_reason: bool = True,
- async_mode: bool = False,
+ async_mode: bool = True,
  strict_mode: bool = False,
  verbose_mode: bool = False,
  evaluation_template: Type[
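`async_mode` now defaults to `True` for `HallucinationMetric`, in line with most other deepeval metrics. A hedged usage sketch (constructor arguments taken from the signature above; `measure()` follows deepeval's usual metric interface and typically needs a configured judge model, e.g. an OpenAI key, to actually run) for callers who want the old synchronous behaviour:

```python
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What does the policy say about refunds?",
    actual_output="Refunds are granted within 30 days.",
    context=["Refunds are granted within 30 days of purchase."],
)

# async_mode=True is now the default; pass False explicitly to keep
# the pre-3.6.2 synchronous behaviour.
metric = HallucinationMetric(threshold=0.5, async_mode=False)
metric.measure(test_case)
print(metric.score, metric.reason)
```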
deepeval/metrics/tool_correctness/tool_correctness.py CHANGED
@@ -152,19 +152,14 @@ class ToolCorrectnessMetric(BaseMetric):

  # Calculate score
  def _calculate_score(self):
- if self.should_exact_match:
+ # Fix: handle empty expected_tools to avoid ZeroDivisionError
+ if len(self.expected_tools) == 0:
+ score = 1.0 if len(self.tools_called) == 0 else 0.0
+ elif self.should_exact_match:
  score = self._calculate_exact_match_score()
  elif self.should_consider_ordering:
  _, weighted_length = self._compute_weighted_lcs()
- if (
- len(self.tools_called) == len(self.expected_tools)
- and len(self.expected_tools) == 0
- ):
- score = 1.0
- elif len(self.expected_tools) == 0:
- score = 0.0
- else:
- score = weighted_length / len(self.expected_tools)
+ score = weighted_length / len(self.expected_tools)
  else:
  score = self._calculate_non_exact_match_score()
  return 0 if self.strict_mode and score < self.threshold else score
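The rewrite collapses the ordering branch's special cases into one up-front guard. A self-contained sketch of the scoring rule this hunk settles on (an illustrative helper, not deepeval's class): an empty `expected_tools` list scores 1.0 only when nothing was called, and the division in the ordering branch can no longer see a zero divisor.

```python
# Illustrative reconstruction of the new guard, not deepeval's actual class.
def calculate_score(expected_tools: list, tools_called: list,
                    weighted_length: float) -> float:
    # Up-front guard: an empty expectation is a perfect score only
    # when no tools were called at all.
    if len(expected_tools) == 0:
        return 1.0 if len(tools_called) == 0 else 0.0
    # With the guard above, this division (the ordering branch) can no
    # longer divide by zero.
    return weighted_length / len(expected_tools)


print(calculate_score([], [], 0.0))                   # 1.0
print(calculate_score([], ["search"], 0.0))           # 0.0
print(calculate_score(["search"], ["search"], 1.0))   # 1.0
```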
deepeval/test_case/llm_test_case.py CHANGED
@@ -1,4 +1,5 @@
  from pydantic import (
+ ConfigDict,
  Field,
  BaseModel,
  model_validator,
@@ -151,6 +152,8 @@ class ToolCall(BaseModel):


  class LLMTestCase(BaseModel):
+ model_config = ConfigDict(extra="ignore")
+
  input: str
  actual_output: Optional[str] = Field(
  default=None,
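A brief sketch, with a stand-in model rather than deepeval's `LLMTestCase`, of what the added `ConfigDict(extra="ignore")` spells out: unknown fields are dropped at validation time rather than stored (`"allow"`) or rejected (`"forbid"`).

```python
from pydantic import BaseModel, ConfigDict


class Example(BaseModel):
    # Mirrors the setting the diff adds to LLMTestCase: unexpected
    # fields are silently discarded during validation.
    model_config = ConfigDict(extra="ignore")

    input: str


case = Example(input="What is DeepEval?", unexpected_field="dropped")
print(case.model_dump())  # {'input': 'What is DeepEval?'}
```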
deepeval/tracing/otel/utils.py CHANGED
@@ -1,11 +1,16 @@
+ import json
+
  from typing import List, Optional, Tuple, Any
- from deepeval.tracing.types import Trace, LLMTestCase, ToolCall
- from deepeval.tracing import trace_manager, BaseSpan
  from opentelemetry.sdk.trace.export import ReadableSpan
- import json

+ from deepeval.evaluate.utils import create_api_test_case
+ from deepeval.test_run.api import LLMApiTestCase
+ from deepeval.test_run.test_run import global_test_run_manager
+ from deepeval.tracing.types import Trace, LLMTestCase, ToolCall
+ from deepeval.tracing import trace_manager, BaseSpan
  from deepeval.tracing.utils import make_json_serializable

+
  GEN_AI_OPERATION_NAMES = ["chat", "generate_content", "task_completion"]


@@ -107,12 +112,12 @@ def check_llm_input_from_gen_ai_attributes(
  input = json.loads(span.attributes.get("gen_ai.input.messages"))
  input = _flatten_input(input)

- except Exception as e:
+ except Exception:
  pass
  try:
  output = json.loads(span.attributes.get("gen_ai.output.messages"))
  output = _flatten_input(output)
- except Exception as e:
+ except Exception:
  pass

  if input is None and output is None:
@@ -126,7 +131,7 @@ check_llm_input_from_gen_ai_attributes(
  and last_event.get("event.name") == "gen_ai.choice"
  ):
  output = last_event
- except Exception as e:
+ except Exception:
  pass

  return input, output
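The handler changes throughout this file drop an exception binding that was never read. A tiny illustrative snippet of the pattern (not deepeval code):

```python
import json


def safe_json_loads(raw: str):
    try:
        return json.loads(raw)
    # Previously written as `except Exception as e:`; since `e` was never
    # used, the bare form avoids an unused-variable warning with no
    # behavioural change.
    except Exception:
        return None
```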
@@ -181,7 +186,7 @@ def _flatten_input(input: list) -> list:
  }
  )
  return result
- except Exception as e:
+ except Exception:
  return input

  return input
@@ -192,7 +197,7 @@ def check_tool_name_from_gen_ai_attributes(span: ReadableSpan) -> Optional[str]:
  gen_ai_tool_name = span.attributes.get("gen_ai.tool.name")
  if gen_ai_tool_name:
  return gen_ai_tool_name
- except Exception as e:
+ except Exception:
  pass

  return None
@@ -205,7 +210,7 @@ def check_tool_input_parameters_from_gen_ai_attributes(
  tool_arguments = span.attributes.get("tool_arguments")
  if tool_arguments:
  return json.loads(tool_arguments)
- except Exception as e:
+ except Exception:
  pass

  return None
@@ -224,7 +229,7 @@ def check_span_type_from_gen_ai_attributes(span: ReadableSpan):

  elif gen_ai_tool_name:
  return "tool"
- except Exception as e:
+ except Exception:
  pass

  return "base"
@@ -235,7 +240,7 @@ def check_model_from_gen_ai_attributes(span: ReadableSpan):
  gen_ai_request_model_name = span.attributes.get("gen_ai.request.model")
  if gen_ai_request_model_name:
  return gen_ai_request_model_name
- except Exception as e:
+ except Exception:
  pass

  return None
@@ -286,7 +291,7 @@ def prepare_trace_llm_test_case(span: ReadableSpan) -> Optional[LLMTestCase]:
  tools_called.append(
  ToolCall.model_validate_json(tool_call_json_str)
  )
- except Exception as e:
+ except Exception:
  pass

  _expected_tools = span.attributes.get(
@@ -299,7 +304,7 @@ def prepare_trace_llm_test_case(span: ReadableSpan) -> Optional[LLMTestCase]:
  expected_tools.append(
  ToolCall.model_validate_json(tool_call_json_str)
  )
- except Exception as e:
+ except Exception:
  pass

  test_case.tools_called = tools_called
@@ -328,12 +333,6 @@ def parse_list_of_strings(context: List[str]) -> List[str]:
  return parsed_context


- from deepeval.evaluate.utils import create_api_test_case
- from deepeval.test_run.api import LLMApiTestCase
- from deepeval.test_run.test_run import global_test_run_manager
- from typing import Optional
-
-
  def post_test_run(traces: List[Trace], test_run_id: Optional[str]):
  # Accept single trace or list of traces
  if isinstance(traces, Trace):
@@ -442,7 +441,7 @@ def check_pydantic_ai_agent_input_output(
  def check_tool_output(span: ReadableSpan):
  try:
  return span.attributes.get("tool_response")
- except Exception as e:
+ except Exception:
  pass
  return None

deepeval/tracing/tracing.py CHANGED
@@ -208,7 +208,13 @@ class TraceManager:
  else:
  # print(f"Ending trace: {trace.root_spans}")
  self.environment = Environment.TESTING
- trace.root_spans = [trace.root_spans[0].children[0]]
+ if (
+ trace.root_spans
+ and len(trace.root_spans) > 0
+ and trace.root_spans[0].children
+ and len(trace.root_spans[0].children) > 0
+ ):
+ trace.root_spans = [trace.root_spans[0].children[0]]
  for root_span in trace.root_spans:
  root_span.parent_uuid = None

deepeval/tracing/utils.py CHANGED
@@ -1,15 +1,12 @@
  import os
- import time
  import inspect
  import json
  import sys
- import difflib
  from datetime import datetime, timezone
  from enum import Enum
  from time import perf_counter
- import time
  from collections import deque
- from typing import Any, Dict, Optional, Sequence, Callable
+ from typing import Any, Dict, Optional

  from deepeval.constants import CONFIDENT_TRACING_ENABLED

@@ -189,8 +186,8 @@ def perf_counter_to_datetime(perf_counter_value: float) -> datetime:
  def replace_self_with_class_name(obj):
  try:
  return f"<{obj.__class__.__name__}>"
- except:
- return f"<self>"
+ except Exception:
+ return "<self>"


  def get_deepeval_trace_mode() -> Optional[str]:
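For context, a small illustrative snippet (not deepeval code) of why the narrower handler and the plain string literal are preferable here:

```python
def class_name_or_placeholder(obj) -> str:
    try:
        return f"<{obj.__class__.__name__}>"
    # A bare `except:` would also trap KeyboardInterrupt and SystemExit;
    # `except Exception:` limits the catch to ordinary errors. The fallback
    # string also no longer needs an f-prefix since it has no placeholders.
    except Exception:
        return "<self>"
```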
deepeval-3.6.1.dist-info/METADATA → deepeval-3.6.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: deepeval
- Version: 3.6.1
+ Version: 3.6.2
  Summary: The LLM Evaluation Framework
  Home-page: https://github.com/confident-ai/deepeval
  License: Apache-2.0
@@ -359,7 +359,7 @@ for golden in dataset.goldens:

  @pytest.mark.parametrize(
  "test_case",
- dataset,
+ dataset.test_cases,
  )
  def test_customer_chatbot(test_case: LLMTestCase):
  answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5)
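This README fix parametrizes over `dataset.test_cases` rather than the dataset object itself. A sketch of the corrected quickstart, assuming deepeval's documented `EvaluationDataset` / `assert_test` API (reproduced from memory, so treat the setup names as assumptions):

```python
import pytest

from deepeval import assert_test
from deepeval.dataset import EvaluationDataset
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

dataset = EvaluationDataset()
dataset.add_test_case(
    LLMTestCase(
        input="What are your shipping times?",
        actual_output="Orders usually ship within 2 business days.",
    )
)


# Parametrize over the materialised test cases, not the dataset object itself.
@pytest.mark.parametrize("test_case", dataset.test_cases)
def test_customer_chatbot(test_case: LLMTestCase):
    answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5)
    assert_test(test_case, [answer_relevancy_metric])
```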
deepeval-3.6.1.dist-info/RECORD → deepeval-3.6.2.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
  deepeval/__init__.py,sha256=6fsb813LD_jNhqR-xZnSdE5E-KsBbC3tc4oIg5ZMgTw,2115
- deepeval/_version.py,sha256=60ky4ZrqXl83ooFzPWUHtPFcXD1XP6b9GQDnqw3EHOU,27
+ deepeval/_version.py,sha256=3BMVt8jAt3lUkzkZWaFVDhhP9a-3lhvDGzjhGKNfjCo,27
  deepeval/annotation/__init__.py,sha256=ZFhUVNNuH_YgQSZJ-m5E9iUb9TkAkEV33a6ouMDZ8EI,111
  deepeval/annotation/annotation.py,sha256=3j3-syeJepAcEj3u3e4T_BeRDzNr7yXGDIoNQGMKpwQ,2298
  deepeval/annotation/api.py,sha256=EYN33ACVzVxsFleRYm60KB4Exvff3rPJKt1VBuuX970,2147
@@ -159,7 +159,7 @@ deepeval/evaluate/api.py,sha256=rkblH0ZFAAdyuF0Ymh7JE1pIJPR9yFuPrn9SQaCEQp4,435
  deepeval/evaluate/compare.py,sha256=tdSJY4E7YJ_zO3dzvpwngZHLiUI2YQcTWJOLI83htsQ,9855
  deepeval/evaluate/configs.py,sha256=QfWjaWNxLsgEe8-5j4PIs5WcSyEckiWt0qdpXSpl57M,928
  deepeval/evaluate/evaluate.py,sha256=NPAJ2iJqJI_RurXKUIC0tft_ozYMIKwZf5iPfmnNhQc,10412
- deepeval/evaluate/execute.py,sha256=7RCjn2GGcjqK6cp9-0BtHL6PPJNw5-KXqXL60GN3G5Y,88672
+ deepeval/evaluate/execute.py,sha256=XS0XtDGKC1ZOo09lthillfi5aDI5TWFbJ-Y7yICNvGo,89056
  deepeval/evaluate/types.py,sha256=IGZ3Xsj0UecPI3JNeTpJaK1gDvlepokfCmHwtItIW9M,831
  deepeval/evaluate/utils.py,sha256=kkliSGzuICeUsXDtlMMPfN95dUKlqarNhfciSffd4gI,23143
  deepeval/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -245,7 +245,7 @@ deepeval/metrics/g_eval/schema.py,sha256=V629txuDrr_2IEKEsgJVYYZb_pkdfcltQV9Zjvx
  deepeval/metrics/g_eval/template.py,sha256=mHj4-mr_HQwbCjpHg7lM_6UesoSatL3g8UGGQAOdT0U,4509
  deepeval/metrics/g_eval/utils.py,sha256=uUT86jRXVYvLDzcnZvvfWssDyGoBHb66nWcJSg4i1u4,8784
  deepeval/metrics/hallucination/__init__.py,sha256=rCVlHi2UGzDKmZKi0esFLafmshVBx2WZ0jiIb-KqcYQ,44
- deepeval/metrics/hallucination/hallucination.py,sha256=tozck1KwqDv80Nd449QH6_6mG15768eCGxmjoBsbzKw,9549
+ deepeval/metrics/hallucination/hallucination.py,sha256=8JN5pj5YWRtl7rgbbFQF6EVBCGm1NV9vaX3_5tScNs4,9548
  deepeval/metrics/hallucination/schema.py,sha256=V8xbrBLMwJfre-lPuDc7rMEdhHf_1hfgoW1jE_ULvAY,286
  deepeval/metrics/hallucination/template.py,sha256=hiss1soxSBFqzOt0KmHZdZUzoQsmXnslDyb8HsjALPs,2620
  deepeval/metrics/indicator.py,sha256=oewo_n5Qet9Zfzo2QQs-EQ8w92siuyDCAmoTZW45ndc,10244
@@ -348,7 +348,7 @@ deepeval/metrics/task_completion/schema.py,sha256=JfnZkbCh7skWvrESy65GEo6Rvo0FDJ
  deepeval/metrics/task_completion/task_completion.py,sha256=RKFkXCVOhO70I8A16zv5BCaV3QVKldNxawJ0T93U_Zc,8978
  deepeval/metrics/task_completion/template.py,sha256=4xjTBcGrPQxInbf8iwJOZyok9SQex1aCkbxKmfkXoA4,10437
  deepeval/metrics/tool_correctness/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- deepeval/metrics/tool_correctness/tool_correctness.py,sha256=8uyNFGM_aGFAB2aCv2CVDg5cjj0OJe8UVDqaT3Gp3kU,12090
+ deepeval/metrics/tool_correctness/tool_correctness.py,sha256=4dS8o5pD2o9W2uDb-lFgulHpLI5kFhAlguWlffIreUU,11993
  deepeval/metrics/toxicity/__init__.py,sha256=1lgt8BKxfBDd7bfSLu_5kMzmsr9b2_ahPK9oq5zLkMk,39
  deepeval/metrics/toxicity/schema.py,sha256=7uUdzXqTvIIz5nfahlllo_fzVRXg7UeMeXn7Hl32pKY,459
  deepeval/metrics/toxicity/template.py,sha256=zl4y4Tg9gXkxKJ8aXVwj0cJ94pvfYuP7MTeV3dvB5yQ,5045
@@ -434,7 +434,7 @@ deepeval/telemetry.py,sha256=JPZw1VBJ5dGiS8k-dzWs5OhMbNlr65QgVretTy33WCg,21704
  deepeval/test_case/__init__.py,sha256=hLkHxGH0-FFhx4MlJwIbzNHL4pgyLGquh8l0qD-z_cQ,731
  deepeval/test_case/arena_test_case.py,sha256=PcfDxadlc4yW4AEDdvN32AeUpx2Sms1jvnbX31Xu65o,957
  deepeval/test_case/conversational_test_case.py,sha256=lF0V1yCGCInQetggm2wbXx-MkuMRs2ScwqIXCSwb1Fs,7534
- deepeval/test_case/llm_test_case.py,sha256=uWipuFVzKR3gYSpAbjK6GB_6XdtDMIRDNms-LyZYsuc,12117
+ deepeval/test_case/llm_test_case.py,sha256=L-dCvJ4pMPPavZTyN9ZKN30h351DWI_TunmXfHPIjig,12180
  deepeval/test_case/mcp.py,sha256=Z625NLvz0E_UJpbyfyuAi_4nsqKH6DByBf0rfKd70xU,1879
  deepeval/test_case/mllm_test_case.py,sha256=8a0YoE72geX_fLI6yk_cObSxCPddwW-DOb-5OPE1-W8,5414
  deepeval/test_case/utils.py,sha256=5lT7QmhItsQHt44-qQfspuktilcrEyvl2cS0cgUJxds,809
@@ -454,15 +454,15 @@ deepeval/tracing/offline_evals/thread.py,sha256=bcSGFcZJKnszArOLIlWvnCyt0zSmsd7X
  deepeval/tracing/offline_evals/trace.py,sha256=vTflaTKysKRiYvKA-Nx6PUJ3J6NrRLXiIdWieVcm90E,1868
  deepeval/tracing/otel/__init__.py,sha256=HQsaF5yLPwyW5qg8AOV81_nG_7pFHnatOTHi9Wx3HEk,88
  deepeval/tracing/otel/exporter.py,sha256=wPO1ITKpjueLOSNLO6nD2QL9LAd8Xcu6en8hRkB61Wo,28891
- deepeval/tracing/otel/utils.py,sha256=THXOoqLau4w6Jlz0YJV3K3vQcVptxo14hcDQCJiPeks,14821
+ deepeval/tracing/otel/utils.py,sha256=4FqCwOi-iYhuQ3GhAkbbmXbfhvSLGj9DAdfPCrUIccs,14738
  deepeval/tracing/patchers.py,sha256=DAPNkhrDtoeyJIVeQDUMhTz-xGcXu00eqjQZmov8FiU,3096
  deepeval/tracing/perf_epoch_bridge.py,sha256=iyAPddB6Op7NpMtPHJ29lDm53Btz9yLaN6xSCfTRQm4,1825
- deepeval/tracing/tracing.py,sha256=WFXfGLt58Ia9yCohDZBIUGX6mwieoF8489UziuC-NJI,42458
+ deepeval/tracing/tracing.py,sha256=xZEyuxdGY259nQaDkGp_qO7Avriv8hrf4L15ZfeMNV8,42728
  deepeval/tracing/types.py,sha256=l_utWKerNlE5H3mOKpeUJLsvpP3cMyjH7HRANNgTmSQ,5306
- deepeval/tracing/utils.py,sha256=RUcsDpS_aobK3zuNfZGNvjk7aBbBfHOj3aYu2hRZzg0,7993
+ deepeval/tracing/utils.py,sha256=SLnks8apGlrV6uVnvFVl2mWYABEkvXbPXnQvq3KaU_o,7943
  deepeval/utils.py,sha256=-_o3W892u7naX4Y7a8if4mP0Rtkgtapg6Krr1ZBpj0o,17197
- deepeval-3.6.1.dist-info/LICENSE.md,sha256=0ATkuLv6QgsJTBODUHC5Rak_PArA6gv2t7inJzNTP38,11352
- deepeval-3.6.1.dist-info/METADATA,sha256=UrYM0bqzIvhmMlevcqO-Hcbbm2e5r26FwWEzz2rKua8,18743
- deepeval-3.6.1.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
- deepeval-3.6.1.dist-info/entry_points.txt,sha256=fVr8UphXTfJe9I2rObmUtfU3gkSrYeM0pLy-NbJYg10,94
- deepeval-3.6.1.dist-info/RECORD,,
+ deepeval-3.6.2.dist-info/LICENSE.md,sha256=0ATkuLv6QgsJTBODUHC5Rak_PArA6gv2t7inJzNTP38,11352
+ deepeval-3.6.2.dist-info/METADATA,sha256=TZ7FbJUYYZ1w2P-qmLZdIHB0zv4TnZ4VeLBgN9Bq6Yo,18754
+ deepeval-3.6.2.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+ deepeval-3.6.2.dist-info/entry_points.txt,sha256=fVr8UphXTfJe9I2rObmUtfU3gkSrYeM0pLy-NbJYg10,94
+ deepeval-3.6.2.dist-info/RECORD,,