deepeval 3.6.4__py3-none-any.whl → 3.6.5__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,7 +1,19 @@
 import json
+import logging
 import os
 from typing import Literal, Optional, List
 
+from deepeval.config.settings import get_settings
+from deepeval.confident.api import get_confident_api_key
+from deepeval.prompt import Prompt
+from deepeval.tracing.context import current_trace_context
+from deepeval.tracing.types import Trace
+from deepeval.tracing.otel.utils import to_hex_string
+
+
+logger = logging.getLogger(__name__)
+
+
 try:
     from pydantic_ai.models.instrumented import InstrumentationSettings
     from opentelemetry.sdk.trace import SpanProcessor, TracerProvider
@@ -11,7 +23,20 @@ try:
     )
 
     dependency_installed = True
-except:
+except ImportError as e:
+    if get_settings().DEEPEVAL_VERBOSE_MODE:
+        if isinstance(e, ModuleNotFoundError):
+            logger.warning(
+                "Optional tracing dependency not installed: %s",
+                e.name,
+                stacklevel=2,
+            )
+        else:
+            logger.warning(
+                "Optional tracing import failed: %s",
+                e,
+                stacklevel=2,
+            )
     dependency_installed = False
 
 
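Note on the hunk above: the bare `except:` around the optional pydantic-ai/OpenTelemetry imports is narrowed to `ImportError`, with a warning emitted only in verbose mode. A standalone sketch of the pattern (the dependency name and the `VERBOSE` flag are illustrative stand-ins, not deepeval's actual settings object):

    import logging

    logger = logging.getLogger(__name__)
    VERBOSE = True  # stand-in for get_settings().DEEPEVAL_VERBOSE_MODE

    try:
        import some_optional_dependency  # hypothetical optional extra
        dependency_installed = True
    except ImportError as e:
        if VERBOSE:
            if isinstance(e, ModuleNotFoundError):
                # ModuleNotFoundError carries the missing module's name
                logger.warning("Optional dependency not installed: %s", e.name)
            else:
                logger.warning("Optional import failed: %s", e)
        dependency_installed = False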
@@ -25,6 +50,10 @@ def is_dependency_installed():
 
 from deepeval.confident.api import get_confident_api_key
 from deepeval.prompt import Prompt
+from deepeval.tracing.otel.test_exporter import test_exporter
+from deepeval.tracing.context import current_trace_context
+from deepeval.tracing.types import Trace
+from deepeval.tracing.otel.utils import to_hex_string
 
 # OTLP_ENDPOINT = "http://127.0.0.1:4318/v1/traces"
 OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
@@ -37,6 +66,12 @@ class SpanInterceptor(SpanProcessor):
 
     def on_start(self, span, parent_context):
 
+        # set trace uuid
+        _current_trace_context = current_trace_context.get()
+        if _current_trace_context and isinstance(_current_trace_context, Trace):
+            _otel_trace_id = span.get_span_context().trace_id
+            _current_trace_context.uuid = to_hex_string(_otel_trace_id, 32)
+
         # set trace attributes
         if self.settings.thread_id:
             span.set_attribute(
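The `on_start` hook now copies the OpenTelemetry trace id onto the active deepeval `Trace`. `to_hex_string` itself is not shown in this diff; assuming it zero-pads the integer id to a fixed-width lowercase hex string, the equivalent formatting would be:

    # Assumed behavior of to_hex_string(trace_id, 32): OTel trace ids are
    # 128-bit integers, rendered as 32 hex characters (W3C trace id format).
    trace_id = 0x0123456789ABCDEF0123456789ABCDEF
    print(format(trace_id, "032x"))  # '0123456789abcdef0123456789abcdef'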
@@ -148,8 +183,9 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
         confident_prompt: Optional[Prompt] = None,
         llm_metric_collection: Optional[str] = None,
         agent_metric_collection: Optional[str] = None,
-        tool_metric_collection_map: dict = {},
+        tool_metric_collection_map: Optional[dict] = None,
         trace_metric_collection: Optional[str] = None,
+        is_test_mode: Optional[bool] = False,
     ):
         is_dependency_installed()
 
@@ -162,7 +198,7 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
         ]:
             self.environment = _environment
 
-        self.tool_metric_collection_map = tool_metric_collection_map
+        self.tool_metric_collection_map = tool_metric_collection_map or {}
         self.name = name
         self.thread_id = thread_id
         self.user_id = user_id
@@ -185,12 +221,15 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
         span_interceptor = SpanInterceptor(self)
         trace_provider.add_span_processor(span_interceptor)
 
-        trace_provider.add_span_processor(
-            BatchSpanProcessor(
-                OTLPSpanExporter(
-                    endpoint=OTLP_ENDPOINT,
-                    headers={"x-confident-api-key": api_key},
+        if is_test_mode:
+            trace_provider.add_span_processor(BatchSpanProcessor(test_exporter))
+        else:
+            trace_provider.add_span_processor(
+                BatchSpanProcessor(
+                    OTLPSpanExporter(
+                        endpoint=OTLP_ENDPOINT,
+                        headers={"x-confident-api-key": api_key},
+                    )
                 )
             )
-        )
         super().__init__(tracer_provider=trace_provider)
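Taken together, the constructor changes above replace the mutable `{}` default for `tool_metric_collection_map` with `None` and add an `is_test_mode` flag that swaps the OTLP exporter for the in-memory `test_exporter` defined later in this diff. A hedged usage sketch; the import path is assumed, not confirmed by this diff:

    # Hypothetical import path for the pydantic-ai integration.
    from deepeval.integrations.pydantic_ai import ConfidentInstrumentationSettings

    # With is_test_mode=True, spans go to the in-memory test_exporter
    # instead of the Confident OTLP endpoint.
    settings = ConfidentInstrumentationSettings(is_test_mode=True)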
@@ -41,6 +41,7 @@ class FaithfulnessMetric(BaseMetric):
         strict_mode: bool = False,
         verbose_mode: bool = False,
         truths_extraction_limit: Optional[int] = None,
+        penalize_ambiguous_claims: bool = False,
         evaluation_template: Type[FaithfulnessTemplate] = FaithfulnessTemplate,
     ):
         self.threshold = 1 if strict_mode else threshold
@@ -51,6 +52,7 @@ class FaithfulnessMetric(BaseMetric):
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
         self.evaluation_template = evaluation_template
+        self.penalize_ambiguous_claims = penalize_ambiguous_claims
 
         self.truths_extraction_limit = truths_extraction_limit
         if self.truths_extraction_limit is not None:
@@ -329,6 +331,12 @@ class FaithfulnessMetric(BaseMetric):
             if verdict.verdict.strip().lower() != "no":
                 faithfulness_count += 1
 
+            if (
+                self.penalize_ambiguous_claims
+                and verdict.verdict.strip().lower() == "idk"
+            ):
+                faithfulness_count -= 1
+
         score = faithfulness_count / number_of_verdicts
         return 0 if self.strict_mode and score < self.threshold else score
 
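The scoring arithmetic now optionally penalizes ambiguous claims: any verdict other than "no" still counts toward `faithfulness_count`, but with `penalize_ambiguous_claims=True` each "idk" verdict also subtracts one. A standalone worked example of the rule (plain arithmetic, not the metric class itself):

    verdicts = ["yes", "idk", "no"]
    penalize_ambiguous_claims = True

    count = sum(1 for v in verdicts if v != "no")        # "yes" and "idk" pass -> 2
    if penalize_ambiguous_claims:
        count -= sum(1 for v in verdicts if v == "idk")  # each "idk" subtracts 1 -> 1

    score = count / len(verdicts)  # 1/3 with the penalty, 2/3 without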
@@ -11,7 +11,7 @@ from .test_run import (
 )
 
 from .hooks import on_test_run_end, invoke_test_run_end_hook
-from .api import MetricData
+from .api import MetricData, TurnApi
 from .hyperparameters import log_hyperparameters
 
 
@@ -28,5 +28,6 @@ __all__ = [
     "on_test_run_end",
     "invoke_test_run_end_hook",
     "MetricData",
+    "TurnApi",
     "log_hyperparameters",
 ]
deepeval/test_run/api.py CHANGED
@@ -99,6 +99,7 @@ class TurnApi(BaseModel):
     role: str
     content: str
     order: int
+    user_id: Optional[str] = Field(None, alias="userId")
     retrieval_context: Optional[list] = Field(None, alias="retrievalContext")
     tools_called: Optional[List[ToolCall]] = Field(None, alias="toolsCalled")
     additional_metadata: Optional[Dict] = Field(
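Since `TurnApi` is now exported from `deepeval.test_run` (see the `__all__` hunk above), the new optional `user_id` field can be populated through its camelCase alias. A minimal sketch, assuming pydantic's default alias handling:

    from deepeval.test_run import TurnApi

    turn = TurnApi(role="user", content="Hello", order=0, userId="user-123")
    print(turn.user_id)  # "user-123"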
@@ -2,9 +2,8 @@ from enum import Enum
 import os
 import json
 from pydantic import BaseModel, Field
-from typing import Any, Optional, List, Dict, Union
+from typing import Any, Optional, List, Dict, Union, Tuple
 import shutil
-import webbrowser
 import sys
 import datetime
 import portalocker
@@ -27,6 +26,9 @@ from deepeval.utils import (
     delete_file_if_exists,
     get_is_running_deepeval,
     open_browser,
+    shorten,
+    format_turn,
+    len_short,
 )
 from deepeval.test_run.cache import global_test_run_cache_manager
 from deepeval.constants import CONFIDENT_TEST_CASE_BATCH_SIZE, HIDDEN_DIR
@@ -546,7 +548,7 @@ class TestRunManager:
 
             if (
                 display == TestRunResultDisplay.PASSING
-                and test_case.success == False
+                and test_case.success is False
             ):
                 continue
             elif display == TestRunResultDisplay.FAILING and test_case.success:
@@ -618,7 +620,7 @@ class TestRunManager:
             ):
             if (
                 display == TestRunResultDisplay.PASSING
-                and conversational_test_case.success == False
+                and conversational_test_case.success is False
             ):
                 continue
             elif (
@@ -631,6 +633,65 @@ class TestRunManager:
             fail_count = 0
             conversational_test_case_name = conversational_test_case.name
 
+            if conversational_test_case.turns:
+                turns_table = Table(
+                    title=f"Conversation - {conversational_test_case_name}",
+                    show_header=True,
+                    header_style="bold",
+                )
+                turns_table.add_column("#", justify="right", width=3)
+                turns_table.add_column("Role", justify="left", width=10)
+
+                # subtract fixed widths + borders and padding;
+                # ~20 as a safe buffer
+                details_max_width = max(
+                    48, min(120, console.width - 3 - 10 - 20)
+                )
+                turns_table.add_column(
+                    "Details",
+                    justify="left",
+                    overflow="fold",
+                    max_width=details_max_width,
+                )
+
+                # truncate when too long
+                tools_max_width = min(60, max(24, console.width // 3))
+                turns_table.add_column(
+                    "Tools",
+                    justify="left",
+                    no_wrap=True,
+                    overflow="ellipsis",
+                    max_width=tools_max_width,
+                )
+
+                sorted_turns = sorted(
+                    conversational_test_case.turns, key=lambda t: t.order
+                )
+
+                for t in sorted_turns:
+                    tools = t.tools_called or []
+                    tool_names = ", ".join(tc.name for tc in tools)
+
+                    # omit order, role and tools since we show them in separate columns.
+                    details = format_turn(
+                        t,
+                        include_tools_in_header=False,
+                        include_order_role_in_header=False,
+                    )
+
+                    turns_table.add_row(
+                        str(t.order),
+                        t.role,
+                        details,
+                        shorten(tool_names, len_short()),
+                    )
+
+                console.print(turns_table)
+            else:
+                console.print(
+                    f"[dim]No turns recorded for {conversational_test_case_name}.[/dim]"
+                )
+
             if conversational_test_case.metrics_data is not None:
                 for metric_data in conversational_test_case.metrics_data:
                     if metric_data.success:
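For reference, the column-width clamps in the new turns table work out as follows on a 120-column console:

    console_width = 120
    details_max_width = max(48, min(120, console_width - 3 - 10 - 20))  # -> 87
    tools_max_width = min(60, max(24, console_width // 3))              # -> 40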
@@ -698,7 +759,7 @@ class TestRunManager:
         )
         print(table)
 
-    def post_test_run(self, test_run: TestRun) -> Optional[str]:
+    def post_test_run(self, test_run: TestRun) -> Optional[Tuple[str, str]]:
         if (
             len(test_run.test_cases) == 0
             and len(test_run.conversational_test_cases) == 0
@@ -752,6 +813,21 @@ class TestRunManager:
                     body=body,
                 )
 
+                if not isinstance(data, dict) or "id" not in data:
+                    # try to show helpful details
+                    detail = None
+                    if isinstance(data, dict):
+                        detail = (
+                            data.get("detail")
+                            or data.get("message")
+                            or data.get("error")
+                        )
+                    # fall back to repr for visibility
+                    raise RuntimeError(
+                        f"Confident API response missing 'id'. "
+                        f"detail={detail!r} raw={type(data).__name__}:{repr(data)[:500]}"
+                    )
+
                 res = TestRunHttpResponse(
                     id=data["id"],
                 )
@@ -814,7 +890,7 @@ class TestRunManager:
             )
             self.save_final_test_run_link(link)
             open_browser(link)
-        return link
+        return link, res.id
 
     def save_test_run_locally(self):
         local_folder = os.getenv("DEEPEVAL_RESULTS_FOLDER")
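Because `post_test_run` (and, below, the wrap-up path) now returns a `(link, test_run_id)` tuple rather than a bare link string, callers need a small adjustment. A hedged sketch of a hypothetical caller; `test_run_manager` and `test_run` are assumed to exist:

    result = test_run_manager.post_test_run(test_run)
    if result is not None:
        link, test_run_id = result  # previously: link = result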
@@ -841,7 +917,7 @@ class TestRunManager:
         runDuration: float,
         display_table: bool = True,
         display: Optional[TestRunResultDisplay] = TestRunResultDisplay.ALL,
-    ) -> Optional[str]:
+    ) -> Optional[Tuple[str, str]]:
         test_run = self.get_test_run()
         if test_run is None:
             print("Test Run is empty, please try again.")
@@ -868,8 +944,8 @@ class TestRunManager:
         test_run.sort_test_cases()
 
         if global_test_run_cache_manager.disable_write_cache is None:
-            global_test_run_cache_manager.disable_write_cache = (
-                get_is_running_deepeval() == False
+            global_test_run_cache_manager.disable_write_cache = not bool(
+                get_is_running_deepeval()
             )
 
         global_test_run_cache_manager.wrap_up_cached_test_run()
@@ -4,6 +4,7 @@ from .context import (
     update_retriever_span,
     update_llm_span,
 )
+from .trace_context import trace
 from .types import BaseSpan, Trace
 from .tracing import observe, trace_manager
 from .offline_evals import evaluate_thread, evaluate_trace, evaluate_span
@@ -16,6 +17,7 @@ __all__ = [
     "BaseSpan",
     "Trace",
     "observe",
+    "trace",
     "trace_manager",
     "evaluate_thread",
     "evaluate_trace",
@@ -0,0 +1,35 @@
+from typing import List, Dict, Any, Sequence
+from opentelemetry.sdk.trace import ReadableSpan
+from opentelemetry.sdk.trace.export import SpanExporter
+from opentelemetry.sdk.trace.export import SpanExportResult
+import json
+from datetime import datetime
+
+
+class TestExporter(SpanExporter):
+    """In-memory exporter used in tests; it stores exported spans as a list of dictionaries."""
+
+    span_json_list: List[Dict[str, Any]] = []
+
+    def export(
+        self, spans: Sequence[ReadableSpan], timeout_millis: int = 30000
+    ) -> SpanExportResult:
+        for span in spans:
+            _span_json = json.loads(span.to_json())
+            self.span_json_list.append(_span_json)
+
+        return SpanExportResult.SUCCESS
+
+    def get_span_json_list(self) -> List[Dict[str, Any]]:
+        return sorted(
+            self.span_json_list,
+            key=lambda x: datetime.fromisoformat(
+                x["start_time"].replace("Z", "+00:00")
+            ),
+        )
+
+    def clear_span_json_list(self):
+        self.span_json_list = []
+
+
+test_exporter = TestExporter()
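A hedged usage sketch of the new in-memory exporter, wired through the standard OTel SDK the same way `is_test_mode` wires it above (a `SimpleSpanProcessor` is used here so the example flushes synchronously; the diff itself uses `BatchSpanProcessor`):

    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import SimpleSpanProcessor

    from deepeval.tracing.otel.test_exporter import test_exporter

    provider = TracerProvider()
    provider.add_span_processor(SimpleSpanProcessor(test_exporter))

    with provider.get_tracer(__name__).start_as_current_span("demo"):
        pass  # span ends here and is exported synchronously

    print(len(test_exporter.get_span_json_list()))  # 1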
@@ -0,0 +1,14 @@
+from .context import current_trace_context
+from .tracing import trace_manager
+from contextlib import contextmanager
+
+
+@contextmanager
+def trace():
+    current_trace = current_trace_context.get()
+
+    if not current_trace:
+        current_trace = trace_manager.start_new_trace()
+        current_trace_context.set(current_trace)
+
+    yield current_trace
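A minimal usage sketch of the new `trace` context manager, which the `__init__` hunk above re-exports from `deepeval.tracing`; it yields the active `Trace`, creating one via `trace_manager.start_new_trace()` when none is set:

    from deepeval.tracing import trace

    with trace() as current_trace:
        # current_trace is the active Trace; the SpanInterceptor hunk above
        # shows its uuid being synced to the OTel trace id (attribute assumed).
        print(current_trace.uuid)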
@@ -1,3 +1,4 @@
+import os
 from typing import Any, Dict, List, Literal, Optional, Set, Union, Callable
 from time import perf_counter
 import threading
@@ -47,13 +48,12 @@ from deepeval.tracing.utils import (
     tracing_enabled,
     validate_environment,
     validate_sampling_rate,
-    dump_body_to_json_file,
-    get_deepeval_trace_mode,
 )
 from deepeval.utils import dataclass_to_dict
 from deepeval.tracing.context import current_span_context, current_trace_context
 from deepeval.tracing.types import TestCaseMetricPair
 from deepeval.tracing.api import PromptApi
+from tests.test_integrations.manager import trace_testing_manager
 
 EVAL_DUMMY_SPAN_NAME = "evals_iterator"
 
@@ -183,13 +183,14 @@ class TraceManager:
         if trace.status == TraceSpanStatus.IN_PROGRESS:
             trace.status = TraceSpanStatus.SUCCESS
 
-        mode = get_deepeval_trace_mode()
-        if mode == "gen":
+        if trace_testing_manager.test_name:
+            # Trace testing mode is enabled.
+            # Instead of posting the trace to the queue, store it in this global variable.
             body = self.create_trace_api(trace).model_dump(
                 by_alias=True, exclude_none=True
             )
-            dump_body_to_json_file(body)
-            # Post the trace to the server before removing it
+            trace_testing_manager.test_dict = make_json_serializable(body)
+        # Post the trace to the server before removing it
         elif not self.evaluating:
             self.post_trace(trace)
         else:
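The end-of-trace path now keys off `trace_testing_manager.test_name` instead of the removed `--deepeval-trace-mode` CLI flag: when a test name is set, the serialized trace body is stored on the manager rather than posted. A hedged sketch of how a test might consume it (only the `test_name` and `test_dict` attributes are confirmed by this diff):

    from tests.test_integrations.manager import trace_testing_manager

    trace_testing_manager.test_name = "my_trace_test"  # enables capture
    # ... run code that produces a trace ...
    captured = trace_testing_manager.test_dict  # JSON-serializable trace body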
deepeval/tracing/utils.py CHANGED
@@ -1,13 +1,8 @@
 import os
-import inspect
-import json
-import sys
 from datetime import datetime, timezone
 from enum import Enum
 from time import perf_counter
 from collections import deque
-from typing import Any, Dict, Optional
-
 from deepeval.constants import CONFIDENT_TRACING_ENABLED
 
 
@@ -186,84 +181,5 @@ def perf_counter_to_datetime(perf_counter_value: float) -> datetime:
 def replace_self_with_class_name(obj):
     try:
         return f"<{obj.__class__.__name__}>"
-    except Exception:
-        return "<self>"
-
-
-def get_deepeval_trace_mode() -> Optional[str]:
-    deepeval_trace_mode = None
-    try:
-        args = sys.argv
-        for idx, arg in enumerate(args):
-            if isinstance(arg, str) and arg.startswith(
-                "--deepeval-trace-mode="
-            ):
-                deepeval_trace_mode = (
-                    arg.split("=", 1)[1].strip().strip('"').strip("'").lower()
-                )
-                break
-            if arg == "--deepeval-trace-mode" and idx + 1 < len(args):
-                deepeval_trace_mode = (
-                    str(args[idx + 1]).strip().strip('"').strip("'").lower()
-                )
-                break
-    except Exception:
-        deepeval_trace_mode = None
-
-    return deepeval_trace_mode
-
-
-def dump_body_to_json_file(
-    body: Dict[str, Any], file_path: Optional[str] = None
-) -> str:
-    entry_file = None
-    try:
-        cmd0 = sys.argv[0] if sys.argv else None
-        if cmd0 and cmd0.endswith(".py"):
-            entry_file = cmd0
-        else:
-            for frame_info in reversed(inspect.stack()):
-                fp = frame_info.filename
-                if (
-                    fp
-                    and fp.endswith(".py")
-                    and "deepeval/tracing" not in fp
-                    and "site-packages" not in fp
-                ):
-                    entry_file = fp
-                    break
-    except Exception:
-        entry_file = None
-
-    if not entry_file:
-        entry_file = "unknown.py"
-
-    abs_entry = os.path.abspath(entry_file)
-    dir_path = os.path.dirname(abs_entry)
-
-    file_arg = None
-    try:
-        for idx, arg in enumerate(sys.argv):
-            if isinstance(arg, str) and arg.startswith(
-                "--deepeval-trace-file-name="
-            ):
-                file_arg = arg.split("=", 1)[1].strip().strip('"').strip("'")
-                break
-            if arg == "--deepeval-trace-file-name" and idx + 1 < len(sys.argv):
-                file_arg = str(sys.argv[idx + 1]).strip().strip('"').strip("'")
-                break
-    except Exception:
-        file_arg = None
-
-    if file_path:
-        dst_path = os.path.abspath(file_path)
-    elif file_arg:
-        dst_path = os.path.abspath(file_arg)
-    else:
-        base_name = os.path.splitext(os.path.basename(abs_entry))[0]
-        dst_path = os.path.join(dir_path, f"{base_name}.json")
-
-    actual_body = make_json_serializable(body)
-    with open(dst_path, "w", encoding="utf-8") as f:
-        json.dump(actual_body, f, ensure_ascii=False, indent=2, sort_keys=True)
-    return dst_path
+    except:
+        return f"<self>"