deepeval-3.6.1-py3-none-any.whl → deepeval-3.6.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deepeval/_version.py CHANGED
@@ -1 +1 @@
- __version__: str = "3.6.1"
+ __version__: str = "3.6.2"
deepeval/evaluate/execute.py CHANGED
@@ -836,7 +836,13 @@ def execute_agentic_test_cases(
  ):
  if asyncio.iscoroutinefunction(observed_callback):
  loop = get_or_create_event_loop()
- loop.run_until_complete(observed_callback(golden.input))
+ coro = observed_callback(golden.input)
+ loop.run_until_complete(
+ asyncio.wait_for(
+ coro,
+ timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+ )
+ )
  else:
  observed_callback(golden.input)
  current_trace: Trace = current_trace_context.get()
@@ -1190,7 +1196,10 @@ async def _a_execute_agentic_test_case(
  _pbar_callback_id=pbar_tags_id,
  ):
  if asyncio.iscoroutinefunction(observed_callback):
- await observed_callback(golden.input)
+ await asyncio.wait_for(
+ observed_callback(golden.input),
+ timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+ )
  else:
  observed_callback(golden.input)
  current_trace: Trace = current_trace_context.get()
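Both hunks above wrap the user's observed callback in `asyncio.wait_for`, so a hung agent call can no longer stall the whole evaluation run. A minimal sketch of the pattern, using an illustrative `PER_TASK_TIMEOUT_SECONDS` constant in place of deepeval's `settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS`:

```python
import asyncio

# Illustrative stand-in for settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS.
PER_TASK_TIMEOUT_SECONDS = 30.0


async def observed_callback(user_input: str) -> str:
    # Stand-in for a user-supplied async agent callback.
    await asyncio.sleep(0.1)
    return f"echo: {user_input}"


def run_callback_with_timeout(user_input: str) -> str:
    loop = asyncio.new_event_loop()
    try:
        coro = observed_callback(user_input)
        # wait_for cancels the coroutine and raises asyncio.TimeoutError
        # if it has not finished within the timeout.
        return loop.run_until_complete(
            asyncio.wait_for(coro, timeout=PER_TASK_TIMEOUT_SECONDS)
        )
    finally:
        loop.close()


if __name__ == "__main__":
    print(run_callback_with_timeout("hello"))
```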
deepeval/metrics/hallucination/hallucination.py CHANGED
@@ -30,7 +30,7 @@ class HallucinationMetric(BaseMetric):
  threshold: float = 0.5,
  model: Optional[Union[str, DeepEvalBaseLLM]] = None,
  include_reason: bool = True,
- async_mode: bool = False,
+ async_mode: bool = True,
  strict_mode: bool = False,
  verbose_mode: bool = False,
  evaluation_template: Type[
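`async_mode` now defaults to `True` for `HallucinationMetric`, in line with most other deepeval metrics. A hedged usage sketch (constructor arguments taken from the signature above; `measure()` follows deepeval's usual metric interface and typically needs a configured judge model, e.g. an OpenAI key, to actually run) for callers who want the old synchronous behaviour:

```python
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What does the policy say about refunds?",
    actual_output="Refunds are granted within 30 days.",
    context=["Refunds are granted within 30 days of purchase."],
)

# async_mode=True is now the default; pass False explicitly to keep
# the pre-3.6.2 synchronous behaviour.
metric = HallucinationMetric(threshold=0.5, async_mode=False)
metric.measure(test_case)
print(metric.score, metric.reason)
```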
deepeval/metrics/tool_correctness/tool_correctness.py CHANGED
@@ -152,19 +152,14 @@ class ToolCorrectnessMetric(BaseMetric):

  # Calculate score
  def _calculate_score(self):
- if self.should_exact_match:
+ # Fix: handle empty expected_tools to avoid ZeroDivisionError
+ if len(self.expected_tools) == 0:
+ score = 1.0 if len(self.tools_called) == 0 else 0.0
+ elif self.should_exact_match:
  score = self._calculate_exact_match_score()
  elif self.should_consider_ordering:
  _, weighted_length = self._compute_weighted_lcs()
- if (
- len(self.tools_called) == len(self.expected_tools)
- and len(self.expected_tools) == 0
- ):
- score = 1.0
- elif len(self.expected_tools) == 0:
- score = 0.0
- else:
- score = weighted_length / len(self.expected_tools)
+ score = weighted_length / len(self.expected_tools)
  else:
  score = self._calculate_non_exact_match_score()
  return 0 if self.strict_mode and score < self.threshold else score
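The rewrite collapses the ordering branch's special cases into one up-front guard. A self-contained sketch of the scoring rule this hunk settles on (an illustrative helper, not deepeval's class): an empty `expected_tools` list scores 1.0 only when nothing was called, and the division in the ordering branch can no longer see a zero divisor.

```python
# Illustrative reconstruction of the new guard, not deepeval's actual class.
def calculate_score(expected_tools: list, tools_called: list,
                    weighted_length: float) -> float:
    # Up-front guard: an empty expectation is a perfect score only
    # when no tools were called at all.
    if len(expected_tools) == 0:
        return 1.0 if len(tools_called) == 0 else 0.0
    # With the guard above, this division (the ordering branch) can no
    # longer divide by zero.
    return weighted_length / len(expected_tools)


print(calculate_score([], [], 0.0))                   # 1.0
print(calculate_score([], ["search"], 0.0))           # 0.0
print(calculate_score(["search"], ["search"], 1.0))   # 1.0
```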
deepeval/test_case/llm_test_case.py CHANGED
@@ -1,4 +1,5 @@
  from pydantic import (
+ ConfigDict,
  Field,
  BaseModel,
  model_validator,
@@ -151,6 +152,8 @@ class ToolCall(BaseModel):


  class LLMTestCase(BaseModel):
+ model_config = ConfigDict(extra="ignore")
+
  input: str
  actual_output: Optional[str] = Field(
  default=None,
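A brief sketch, with a stand-in model rather than deepeval's `LLMTestCase`, of what the added `ConfigDict(extra="ignore")` spells out: unknown fields are dropped at validation time rather than stored (`"allow"`) or rejected (`"forbid"`).

```python
from pydantic import BaseModel, ConfigDict


class Example(BaseModel):
    # Mirrors the setting the diff adds to LLMTestCase: unexpected
    # fields are silently discarded during validation.
    model_config = ConfigDict(extra="ignore")

    input: str


case = Example(input="What is DeepEval?", unexpected_field="dropped")
print(case.model_dump())  # {'input': 'What is DeepEval?'}
```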
deepeval/tracing/otel/utils.py CHANGED
@@ -1,11 +1,16 @@
+ import json
+
  from typing import List, Optional, Tuple, Any
- from deepeval.tracing.types import Trace, LLMTestCase, ToolCall
- from deepeval.tracing import trace_manager, BaseSpan
  from opentelemetry.sdk.trace.export import ReadableSpan
- import json

+ from deepeval.evaluate.utils import create_api_test_case
+ from deepeval.test_run.api import LLMApiTestCase
+ from deepeval.test_run.test_run import global_test_run_manager
+ from deepeval.tracing.types import Trace, LLMTestCase, ToolCall
+ from deepeval.tracing import trace_manager, BaseSpan
  from deepeval.tracing.utils import make_json_serializable

+
  GEN_AI_OPERATION_NAMES = ["chat", "generate_content", "task_completion"]


@@ -107,12 +112,12 @@ def check_llm_input_from_gen_ai_attributes(
  input = json.loads(span.attributes.get("gen_ai.input.messages"))
  input = _flatten_input(input)

- except Exception as e:
+ except Exception:
  pass
  try:
  output = json.loads(span.attributes.get("gen_ai.output.messages"))
  output = _flatten_input(output)
- except Exception as e:
+ except Exception:
  pass

  if input is None and output is None:
@@ -126,7 +131,7 @@ check_llm_input_from_gen_ai_attributes(
  and last_event.get("event.name") == "gen_ai.choice"
  ):
  output = last_event
- except Exception as e:
+ except Exception:
  pass

  return input, output
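The handler changes throughout this file drop an exception binding that was never read. A tiny illustrative snippet of the pattern (not deepeval code):

```python
import json


def safe_json_loads(raw: str):
    try:
        return json.loads(raw)
    # Previously written as `except Exception as e:`; since `e` was never
    # used, the bare form avoids an unused-variable warning with no
    # behavioural change.
    except Exception:
        return None
```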
@@ -181,7 +186,7 @@ def _flatten_input(input: list) -> list:
  }
  )
  return result
- except Exception as e:
+ except Exception:
  return input

  return input
@@ -192,7 +197,7 @@ def check_tool_name_from_gen_ai_attributes(span: ReadableSpan) -> Optional[str]:
  gen_ai_tool_name = span.attributes.get("gen_ai.tool.name")
  if gen_ai_tool_name:
  return gen_ai_tool_name
- except Exception as e:
+ except Exception:
  pass

  return None
@@ -205,7 +210,7 @@ def check_tool_input_parameters_from_gen_ai_attributes(
  tool_arguments = span.attributes.get("tool_arguments")
  if tool_arguments:
  return json.loads(tool_arguments)
- except Exception as e:
+ except Exception:
  pass

  return None
@@ -224,7 +229,7 @@ def check_span_type_from_gen_ai_attributes(span: ReadableSpan):

  elif gen_ai_tool_name:
  return "tool"
- except Exception as e:
+ except Exception:
  pass

  return "base"
@@ -235,7 +240,7 @@ def check_model_from_gen_ai_attributes(span: ReadableSpan):
  gen_ai_request_model_name = span.attributes.get("gen_ai.request.model")
  if gen_ai_request_model_name:
  return gen_ai_request_model_name
- except Exception as e:
+ except Exception:
  pass

  return None
@@ -286,7 +291,7 @@ def prepare_trace_llm_test_case(span: ReadableSpan) -> Optional[LLMTestCase]:
  tools_called.append(
  ToolCall.model_validate_json(tool_call_json_str)
  )
- except Exception as e:
+ except Exception:
  pass

  _expected_tools = span.attributes.get(
@@ -299,7 +304,7 @@ def prepare_trace_llm_test_case(span: ReadableSpan) -> Optional[LLMTestCase]:
  expected_tools.append(
  ToolCall.model_validate_json(tool_call_json_str)
  )
- except Exception as e:
+ except Exception:
  pass

  test_case.tools_called = tools_called
@@ -328,12 +333,6 @@ def parse_list_of_strings(context: List[str]) -> List[str]:
  return parsed_context


- from deepeval.evaluate.utils import create_api_test_case
- from deepeval.test_run.api import LLMApiTestCase
- from deepeval.test_run.test_run import global_test_run_manager
- from typing import Optional
-
-
  def post_test_run(traces: List[Trace], test_run_id: Optional[str]):
  # Accept single trace or list of traces
  if isinstance(traces, Trace):
@@ -442,7 +441,7 @@ def check_pydantic_ai_agent_input_output(
  def check_tool_output(span: ReadableSpan):
  try:
  return span.attributes.get("tool_response")
- except Exception as e:
+ except Exception:
  pass
  return None

deepeval/tracing/tracing.py CHANGED
@@ -208,7 +208,13 @@ class TraceManager:
  else:
  # print(f"Ending trace: {trace.root_spans}")
  self.environment = Environment.TESTING
- trace.root_spans = [trace.root_spans[0].children[0]]
+ if (
+ trace.root_spans
+ and len(trace.root_spans) > 0
+ and trace.root_spans[0].children
+ and len(trace.root_spans[0].children) > 0
+ ):
+ trace.root_spans = [trace.root_spans[0].children[0]]
  for root_span in trace.root_spans:
  root_span.parent_uuid = None

deepeval/tracing/utils.py CHANGED
@@ -1,15 +1,12 @@
  import os
- import time
  import inspect
  import json
  import sys
- import difflib
  from datetime import datetime, timezone
  from enum import Enum
  from time import perf_counter
- import time
  from collections import deque
- from typing import Any, Dict, Optional, Sequence, Callable
+ from typing import Any, Dict, Optional

  from deepeval.constants import CONFIDENT_TRACING_ENABLED

@@ -189,8 +186,8 @@ def perf_counter_to_datetime(perf_counter_value: float) -> datetime:
  def replace_self_with_class_name(obj):
  try:
  return f"<{obj.__class__.__name__}>"
- except:
- return f"<self>"
+ except Exception:
+ return "<self>"


  def get_deepeval_trace_mode() -> Optional[str]:
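For context, a small illustrative snippet (not deepeval code) of why the narrower handler and the plain string literal are preferable here:

```python
def class_name_or_placeholder(obj) -> str:
    try:
        return f"<{obj.__class__.__name__}>"
    # A bare `except:` would also trap KeyboardInterrupt and SystemExit;
    # `except Exception:` limits the catch to ordinary errors. The fallback
    # string also no longer needs an f-prefix since it has no placeholders.
    except Exception:
        return "<self>"
```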
deepeval-3.6.1.dist-info/METADATA → deepeval-3.6.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: deepeval
- Version: 3.6.1
+ Version: 3.6.2
  Summary: The LLM Evaluation Framework
  Home-page: https://github.com/confident-ai/deepeval
  License: Apache-2.0
@@ -359,7 +359,7 @@ for golden in dataset.goldens:

  @pytest.mark.parametrize(
  "test_case",
- dataset,
+ dataset.test_cases,
  )
  def test_customer_chatbot(test_case: LLMTestCase):
  answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5)
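This README fix parametrizes over `dataset.test_cases` rather than the dataset object itself. A sketch of the corrected quickstart, assuming deepeval's documented `EvaluationDataset` / `assert_test` API (reproduced from memory, so treat the setup names as assumptions):

```python
import pytest

from deepeval import assert_test
from deepeval.dataset import EvaluationDataset
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

dataset = EvaluationDataset()
dataset.add_test_case(
    LLMTestCase(
        input="What are your shipping times?",
        actual_output="Orders usually ship within 2 business days.",
    )
)


# Parametrize over the materialised test cases, not the dataset object itself.
@pytest.mark.parametrize("test_case", dataset.test_cases)
def test_customer_chatbot(test_case: LLMTestCase):
    answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5)
    assert_test(test_case, [answer_relevancy_metric])
```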
deepeval-3.6.1.dist-info/RECORD → deepeval-3.6.2.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
  deepeval/__init__.py,sha256=6fsb813LD_jNhqR-xZnSdE5E-KsBbC3tc4oIg5ZMgTw,2115
- deepeval/_version.py,sha256=60ky4ZrqXl83ooFzPWUHtPFcXD1XP6b9GQDnqw3EHOU,27
+ deepeval/_version.py,sha256=3BMVt8jAt3lUkzkZWaFVDhhP9a-3lhvDGzjhGKNfjCo,27
  deepeval/annotation/__init__.py,sha256=ZFhUVNNuH_YgQSZJ-m5E9iUb9TkAkEV33a6ouMDZ8EI,111
  deepeval/annotation/annotation.py,sha256=3j3-syeJepAcEj3u3e4T_BeRDzNr7yXGDIoNQGMKpwQ,2298
  deepeval/annotation/api.py,sha256=EYN33ACVzVxsFleRYm60KB4Exvff3rPJKt1VBuuX970,2147
@@ -159,7 +159,7 @@ deepeval/evaluate/api.py,sha256=rkblH0ZFAAdyuF0Ymh7JE1pIJPR9yFuPrn9SQaCEQp4,435
  deepeval/evaluate/compare.py,sha256=tdSJY4E7YJ_zO3dzvpwngZHLiUI2YQcTWJOLI83htsQ,9855
  deepeval/evaluate/configs.py,sha256=QfWjaWNxLsgEe8-5j4PIs5WcSyEckiWt0qdpXSpl57M,928
  deepeval/evaluate/evaluate.py,sha256=NPAJ2iJqJI_RurXKUIC0tft_ozYMIKwZf5iPfmnNhQc,10412
- deepeval/evaluate/execute.py,sha256=7RCjn2GGcjqK6cp9-0BtHL6PPJNw5-KXqXL60GN3G5Y,88672
+ deepeval/evaluate/execute.py,sha256=XS0XtDGKC1ZOo09lthillfi5aDI5TWFbJ-Y7yICNvGo,89056
  deepeval/evaluate/types.py,sha256=IGZ3Xsj0UecPI3JNeTpJaK1gDvlepokfCmHwtItIW9M,831
  deepeval/evaluate/utils.py,sha256=kkliSGzuICeUsXDtlMMPfN95dUKlqarNhfciSffd4gI,23143
  deepeval/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -245,7 +245,7 @@ deepeval/metrics/g_eval/schema.py,sha256=V629txuDrr_2IEKEsgJVYYZb_pkdfcltQV9Zjvx
  deepeval/metrics/g_eval/template.py,sha256=mHj4-mr_HQwbCjpHg7lM_6UesoSatL3g8UGGQAOdT0U,4509
  deepeval/metrics/g_eval/utils.py,sha256=uUT86jRXVYvLDzcnZvvfWssDyGoBHb66nWcJSg4i1u4,8784
  deepeval/metrics/hallucination/__init__.py,sha256=rCVlHi2UGzDKmZKi0esFLafmshVBx2WZ0jiIb-KqcYQ,44
- deepeval/metrics/hallucination/hallucination.py,sha256=tozck1KwqDv80Nd449QH6_6mG15768eCGxmjoBsbzKw,9549
+ deepeval/metrics/hallucination/hallucination.py,sha256=8JN5pj5YWRtl7rgbbFQF6EVBCGm1NV9vaX3_5tScNs4,9548
  deepeval/metrics/hallucination/schema.py,sha256=V8xbrBLMwJfre-lPuDc7rMEdhHf_1hfgoW1jE_ULvAY,286
  deepeval/metrics/hallucination/template.py,sha256=hiss1soxSBFqzOt0KmHZdZUzoQsmXnslDyb8HsjALPs,2620
  deepeval/metrics/indicator.py,sha256=oewo_n5Qet9Zfzo2QQs-EQ8w92siuyDCAmoTZW45ndc,10244
@@ -348,7 +348,7 @@ deepeval/metrics/task_completion/schema.py,sha256=JfnZkbCh7skWvrESy65GEo6Rvo0FDJ
  deepeval/metrics/task_completion/task_completion.py,sha256=RKFkXCVOhO70I8A16zv5BCaV3QVKldNxawJ0T93U_Zc,8978
  deepeval/metrics/task_completion/template.py,sha256=4xjTBcGrPQxInbf8iwJOZyok9SQex1aCkbxKmfkXoA4,10437
  deepeval/metrics/tool_correctness/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- deepeval/metrics/tool_correctness/tool_correctness.py,sha256=8uyNFGM_aGFAB2aCv2CVDg5cjj0OJe8UVDqaT3Gp3kU,12090
+ deepeval/metrics/tool_correctness/tool_correctness.py,sha256=4dS8o5pD2o9W2uDb-lFgulHpLI5kFhAlguWlffIreUU,11993
  deepeval/metrics/toxicity/__init__.py,sha256=1lgt8BKxfBDd7bfSLu_5kMzmsr9b2_ahPK9oq5zLkMk,39
  deepeval/metrics/toxicity/schema.py,sha256=7uUdzXqTvIIz5nfahlllo_fzVRXg7UeMeXn7Hl32pKY,459
  deepeval/metrics/toxicity/template.py,sha256=zl4y4Tg9gXkxKJ8aXVwj0cJ94pvfYuP7MTeV3dvB5yQ,5045
@@ -434,7 +434,7 @@ deepeval/telemetry.py,sha256=JPZw1VBJ5dGiS8k-dzWs5OhMbNlr65QgVretTy33WCg,21704
  deepeval/test_case/__init__.py,sha256=hLkHxGH0-FFhx4MlJwIbzNHL4pgyLGquh8l0qD-z_cQ,731
  deepeval/test_case/arena_test_case.py,sha256=PcfDxadlc4yW4AEDdvN32AeUpx2Sms1jvnbX31Xu65o,957
  deepeval/test_case/conversational_test_case.py,sha256=lF0V1yCGCInQetggm2wbXx-MkuMRs2ScwqIXCSwb1Fs,7534
- deepeval/test_case/llm_test_case.py,sha256=uWipuFVzKR3gYSpAbjK6GB_6XdtDMIRDNms-LyZYsuc,12117
+ deepeval/test_case/llm_test_case.py,sha256=L-dCvJ4pMPPavZTyN9ZKN30h351DWI_TunmXfHPIjig,12180
  deepeval/test_case/mcp.py,sha256=Z625NLvz0E_UJpbyfyuAi_4nsqKH6DByBf0rfKd70xU,1879
  deepeval/test_case/mllm_test_case.py,sha256=8a0YoE72geX_fLI6yk_cObSxCPddwW-DOb-5OPE1-W8,5414
  deepeval/test_case/utils.py,sha256=5lT7QmhItsQHt44-qQfspuktilcrEyvl2cS0cgUJxds,809
@@ -454,15 +454,15 @@ deepeval/tracing/offline_evals/thread.py,sha256=bcSGFcZJKnszArOLIlWvnCyt0zSmsd7X
  deepeval/tracing/offline_evals/trace.py,sha256=vTflaTKysKRiYvKA-Nx6PUJ3J6NrRLXiIdWieVcm90E,1868
  deepeval/tracing/otel/__init__.py,sha256=HQsaF5yLPwyW5qg8AOV81_nG_7pFHnatOTHi9Wx3HEk,88
  deepeval/tracing/otel/exporter.py,sha256=wPO1ITKpjueLOSNLO6nD2QL9LAd8Xcu6en8hRkB61Wo,28891
- deepeval/tracing/otel/utils.py,sha256=THXOoqLau4w6Jlz0YJV3K3vQcVptxo14hcDQCJiPeks,14821
+ deepeval/tracing/otel/utils.py,sha256=4FqCwOi-iYhuQ3GhAkbbmXbfhvSLGj9DAdfPCrUIccs,14738
  deepeval/tracing/patchers.py,sha256=DAPNkhrDtoeyJIVeQDUMhTz-xGcXu00eqjQZmov8FiU,3096
  deepeval/tracing/perf_epoch_bridge.py,sha256=iyAPddB6Op7NpMtPHJ29lDm53Btz9yLaN6xSCfTRQm4,1825
- deepeval/tracing/tracing.py,sha256=WFXfGLt58Ia9yCohDZBIUGX6mwieoF8489UziuC-NJI,42458
+ deepeval/tracing/tracing.py,sha256=xZEyuxdGY259nQaDkGp_qO7Avriv8hrf4L15ZfeMNV8,42728
  deepeval/tracing/types.py,sha256=l_utWKerNlE5H3mOKpeUJLsvpP3cMyjH7HRANNgTmSQ,5306
- deepeval/tracing/utils.py,sha256=RUcsDpS_aobK3zuNfZGNvjk7aBbBfHOj3aYu2hRZzg0,7993
+ deepeval/tracing/utils.py,sha256=SLnks8apGlrV6uVnvFVl2mWYABEkvXbPXnQvq3KaU_o,7943
  deepeval/utils.py,sha256=-_o3W892u7naX4Y7a8if4mP0Rtkgtapg6Krr1ZBpj0o,17197
- deepeval-3.6.1.dist-info/LICENSE.md,sha256=0ATkuLv6QgsJTBODUHC5Rak_PArA6gv2t7inJzNTP38,11352
- deepeval-3.6.1.dist-info/METADATA,sha256=UrYM0bqzIvhmMlevcqO-Hcbbm2e5r26FwWEzz2rKua8,18743
- deepeval-3.6.1.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
- deepeval-3.6.1.dist-info/entry_points.txt,sha256=fVr8UphXTfJe9I2rObmUtfU3gkSrYeM0pLy-NbJYg10,94
- deepeval-3.6.1.dist-info/RECORD,,
+ deepeval-3.6.2.dist-info/LICENSE.md,sha256=0ATkuLv6QgsJTBODUHC5Rak_PArA6gv2t7inJzNTP38,11352
+ deepeval-3.6.2.dist-info/METADATA,sha256=TZ7FbJUYYZ1w2P-qmLZdIHB0zv4TnZ4VeLBgN9Bq6Yo,18754
+ deepeval-3.6.2.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+ deepeval-3.6.2.dist-info/entry_points.txt,sha256=fVr8UphXTfJe9I2rObmUtfU3gkSrYeM0pLy-NbJYg10,94
+ deepeval-3.6.2.dist-info/RECORD,,