deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +811 -248
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +33 -119
- deepeval/integrations/crewai/__init__.py +7 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +20 -9
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +20 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +4 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +85 -50
- deepeval/openai/patch.py +258 -167
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +205 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +245 -62
- deepeval/prompt/utils.py +186 -15
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +19 -15
- deepeval/test_case/api.py +131 -0
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +292 -206
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +24 -5
- deepeval/tracing/trace_context.py +89 -5
- deepeval/tracing/tracing.py +74 -3
- deepeval/tracing/types.py +20 -2
- deepeval/tracing/utils.py +8 -0
- deepeval/utils.py +21 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/test_run/test_run.py
CHANGED

@@ -32,6 +32,15 @@ from deepeval.utils import (
 )
 from deepeval.test_run.cache import global_test_run_cache_manager
 from deepeval.constants import CONFIDENT_TEST_CASE_BATCH_SIZE, HIDDEN_DIR
+from deepeval.prompt import (
+    PromptMessage,
+    ModelSettings,
+    PromptInterpolationType,
+    OutputType,
+)
+from rich.panel import Panel
+from rich.columns import Columns
+

 TEMP_FILE_PATH = f"{HIDDEN_DIR}/.temp_test_run_data.json"
 LATEST_TEST_RUN_FILE_PATH = f"{HIDDEN_DIR}/.latest_test_run.json"
@@ -71,6 +80,16 @@ class TraceMetricScores(BaseModel):
     base: Dict[str, Dict[str, MetricScores]] = Field(default_factory=dict)


+class PromptData(BaseModel):
+    alias: Optional[str] = None
+    version: Optional[str] = None
+    text_template: Optional[str] = None
+    messages_template: Optional[List[PromptMessage]] = None
+    model_settings: Optional[ModelSettings] = None
+    output_type: Optional[OutputType] = None
+    interpolation_type: Optional[PromptInterpolationType] = None
+
+
 class MetricsAverageDict:
     def __init__(self):
         self.metric_dict = {}
@@ -123,6 +142,7 @@ class TestRun(BaseModel):
     )
     identifier: Optional[str] = None
     hyperparameters: Optional[Dict[str, Any]] = Field(None)
+    prompts: Optional[List[PromptData]] = Field(None)
     test_passed: Optional[int] = Field(None, alias="testPassed")
     test_failed: Optional[int] = Field(None, alias="testFailed")
     run_duration: float = Field(0.0, alias="runDuration")
@@ -191,65 +211,91 @@ class TestRun(BaseModel):
         valid_scores = 0

         def process_metric_data(metric_data: MetricData):
+            """
+            Process and aggregate metric data for overall test metrics.
+
+            Args:
+                metric_data: The metric data to process
+            """
             nonlocal valid_scores
-
+            metric_name = metric_data.name
             score = metric_data.score
             success = metric_data.success
-
-            if
-            metrics_dict[
+
+            if metric_name not in metrics_dict:
+                metrics_dict[metric_name] = {
                     "scores": [],
                     "passes": 0,
                     "fails": 0,
                     "errors": 0,
                 }

+            metric_dict = metrics_dict[metric_name]
+
             if score is None or success is None:
-
+                metric_dict["errors"] += 1
             else:
                 valid_scores += 1
-
-                # Append the score.
-                metrics_dict[name]["scores"].append(score)
-
-                # Increment passes or fails based on the metric_data.success flag.
+                metric_dict["scores"].append(score)
                 if success:
-
+                    metric_dict["passes"] += 1
                 else:
-
+                    metric_dict["fails"] += 1

         def process_span_metric_data(
-            metric_data: MetricData,
+            metric_data: MetricData,
+            span_type: span_api_type_literals,
+            span_name: str,
         ):
+            """
+            Process and aggregate metric data for a specific span.
+
+            Args:
+                metric_data: The metric data to process
+                span_type: The type of span (agent, tool, retriever, llm, base)
+                span_name: The name of the span
+            """
             metric_name = metric_data.name
             score = metric_data.score
             success = metric_data.success

-
-
-            trace_metrics_dict[type][name] = {}
+            if span_name not in trace_metrics_dict[span_type]:
+                trace_metrics_dict[span_type][span_name] = {}

-            if metric_name not in trace_metrics_dict[
-                trace_metrics_dict[
+            if metric_name not in trace_metrics_dict[span_type][span_name]:
+                trace_metrics_dict[span_type][span_name][metric_name] = {
                     "scores": [],
                     "passes": 0,
                     "fails": 0,
                     "errors": 0,
                 }

+            metric_dict = trace_metrics_dict[span_type][span_name][metric_name]
+
             if score is None or success is None:
-
+                metric_dict["errors"] += 1
             else:
-
-                trace_metrics_dict[type][name][metric_name]["scores"].append(
-                    score
-                )
-
-                # Increment passes or fails
+                metric_dict["scores"].append(score)
                 if success:
-
+                    metric_dict["passes"] += 1
                 else:
-
+                    metric_dict["fails"] += 1
+
+        def process_spans(spans, span_type: span_api_type_literals):
+            """
+            Process all metrics for a list of spans of a specific type.
+
+            Args:
+                spans: List of spans to process
+                span_type: The type of spans being processed
+            """
+            for span in spans:
+                if span.metrics_data is not None:
+                    for metric_data in span.metrics_data:
+                        process_metric_data(metric_data)
+                        process_span_metric_data(
+                            metric_data, span_type, span.name
+                        )

         # Process non-conversational test cases.
         for test_case in self.test_cases:
@@ -261,45 +307,14 @@ class TestRun(BaseModel):
             if test_case.trace is None:
                 continue

-
-
-
-
-
-
-
-
-            for span in test_case.trace.tool_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.TOOL.value, span.name
-                        )
-
-            for span in test_case.trace.retriever_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.RETRIEVER.value, span.name
-                        )
-
-            for span in test_case.trace.llm_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.LLM.value, span.name
-                        )
-
-            for span in test_case.trace.base_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.BASE.value, span.name
-                        )
+            # Process all span types using the helper function
+            process_spans(test_case.trace.agent_spans, SpanApiType.AGENT.value)
+            process_spans(test_case.trace.tool_spans, SpanApiType.TOOL.value)
+            process_spans(
+                test_case.trace.retriever_spans, SpanApiType.RETRIEVER.value
+            )
+            process_spans(test_case.trace.llm_spans, SpanApiType.LLM.value)
+            process_spans(test_case.trace.base_spans, SpanApiType.BASE.value)

         # Process conversational test cases.
         for convo_test_case in self.conversational_test_cases:
@@ -532,105 +547,141 @@ class TestRunManager:
     def clear_test_run(self):
         self.test_run = None

-
-
-
-
-
-
-
-
-
-
-
-
-
+    @staticmethod
+    def _calculate_success_rate(pass_count: int, fail_count: int) -> str:
+        """Calculate success rate percentage or return error message."""
+        total = pass_count + fail_count
+        if total > 0:
+            return str(round((100 * pass_count) / total, 2))
+        return "Cannot display metrics for component-level evals, please run 'deepeval view' to see results on Confident AI."
+
+    @staticmethod
+    def _get_metric_status(metric_data: MetricData) -> str:
+        """Get formatted status string for a metric."""
+        if metric_data.error:
+            return "[red]ERRORED[/red]"
+        elif metric_data.success:
+            return "[green]PASSED[/green]"
+        return "[red]FAILED[/red]"
+
+    @staticmethod
+    def _format_metric_score(metric_data: MetricData) -> str:
+        """Format metric score with evaluation details."""
+        evaluation_model = metric_data.evaluation_model or "n/a"
+        metric_score = (
+            round(metric_data.score, 2)
+            if metric_data.score is not None
+            else None
+        )

-
-
-
-
-
-
-
+        return (
+            f"{metric_score} "
+            f"(threshold={metric_data.threshold}, "
+            f"evaluation model={evaluation_model}, "
+            f"reason={metric_data.reason}, "
+            f"error={metric_data.error})"
+        )

-
-
-
+    @staticmethod
+    def _should_skip_test_case(
+        test_case, display: TestRunResultDisplay
+    ) -> bool:
+        """Determine if test case should be skipped based on display filter."""
+        if display == TestRunResultDisplay.PASSING and not test_case.success:
+            return True
+        elif display == TestRunResultDisplay.FAILING and test_case.success:
+            return True
+        return False
+
+    @staticmethod
+    def _count_metric_results(
+        metrics_data: List[MetricData],
+    ) -> tuple[int, int]:
+        """Count passing and failing metrics."""
+        pass_count = 0
+        fail_count = 0
+        for metric_data in metrics_data:
+            if metric_data.success:
+                pass_count += 1
+            else:
+                fail_count += 1
+        return pass_count, fail_count

-
-
-
+    def _add_test_case_header_row(
+        self,
+        table: Table,
+        test_case_name: str,
+        pass_count: int,
+        fail_count: int,
+    ):
+        """Add test case header row with name and success rate."""
+        success_rate = self._calculate_success_rate(pass_count, fail_count)
+        table.add_row(
+            test_case_name,
+            *[""] * 3,
+            f"{success_rate}%",
+        )

-
-
-
-
-
+    def _add_metric_rows(self, table: Table, metrics_data: List[MetricData]):
+        """Add metric detail rows to the table."""
+        for metric_data in metrics_data:
+            status = self._get_metric_status(metric_data)
+            formatted_score = self._format_metric_score(metric_data)

-            success_rate = (
-                round((100 * pass_count) / (pass_count + fail_count), 2)
-                if pass_count + fail_count > 0
-                else "Cannot display metrics for component-level evals, please run 'deepeval view' to see results on Confident AI."
-            )
             table.add_row(
-                test_case_name,
                 "",
+                str(metric_data.name),
+                formatted_score,
+                status,
                 "",
-                "",
-                f"{success_rate}%",
             )

-
-
-
-            elif metric_data.success:
-                status = "[green]PASSED[/green]"
-            else:
-                status = "[red]FAILED[/red]"
+    def _add_separator_row(self, table: Table):
+        """Add empty separator row between test cases."""
+        table.add_row(*[""] * len(table.columns))

-
-
-
+    def display_results_table(
+        self, test_run: TestRun, display: TestRunResultDisplay
+    ):
+        """Display test results in a formatted table."""

-
-
-
-
-
-
-
-
-
-                status,
-                "",
-            )
+        table = Table(title="Test Results")
+        column_config = dict(justify="left")
+        column_names = [
+            "Test case",
+            "Metric",
+            "Score",
+            "Status",
+            "Overall Success Rate",
+        ]

-
-
-                "",
-                "",
-                "",
-                "",
-                "",
-            )
+        for name in column_names:
+            table.add_column(name, **column_config)

+        # Process regular test cases
+        for index, test_case in enumerate(test_run.test_cases):
+            if test_case.metrics_data is None or self._should_skip_test_case(
+                test_case, display
+            ):
+                continue
+            pass_count, fail_count = self._count_metric_results(
+                test_case.metrics_data
+            )
+            self._add_test_case_header_row(
+                table, test_case.name, pass_count, fail_count
+            )
+            self._add_metric_rows(table, test_case.metrics_data)
+
+            if index < len(test_run.test_cases) - 1:
+                self._add_separator_row(table)
+
+        # Process conversational test cases
         for index, conversational_test_case in enumerate(
             test_run.conversational_test_cases
         ):
-            if (
-                display == TestRunResultDisplay.PASSING
-                and conversational_test_case.success is False
-            ):
-                continue
-            elif (
-                display == TestRunResultDisplay.FAILING
-                and conversational_test_case.success
-            ):
+            if self._should_skip_test_case(conversational_test_case, display):
                 continue

-            pass_count = 0
-            fail_count = 0
             conversational_test_case_name = conversational_test_case.name

             if conversational_test_case.turns:
@@ -691,71 +742,26 @@ class TestRunManager:
                 console.print(
                     f"[dim]No turns recorded for {conversational_test_case_name}.[/dim]"
                 )
-
             if conversational_test_case.metrics_data is not None:
-
-
-                    pass_count += 1
-                else:
-                    fail_count += 1
-                table.add_row(
-                    conversational_test_case_name,
-                    "",
-                    "",
-                    "",
-                    f"{round((100*pass_count)/(pass_count+fail_count),2)}%",
+                pass_count, fail_count = self._count_metric_results(
+                    conversational_test_case.metrics_data
                 )
-
-
-                for metric_data in conversational_test_case.metrics_data:
-                    if metric_data.error:
-                        status = "[red]ERRORED[/red]"
-                    elif metric_data.success:
-                        status = "[green]PASSED[/green]"
-                    else:
-                        status = "[red]FAILED[/red]"
-
-                    evaluation_model = metric_data.evaluation_model
-                    if evaluation_model is None:
-                        evaluation_model = "n/a"
-
-                    if metric_data.score is not None:
-                        metric_score = round(metric_data.score, 2)
-                    else:
-                        metric_score = None
-
-                    table.add_row(
-                        "",
-                        str(metric_data.name),
-                        f"{metric_score} (threshold={metric_data.threshold}, evaluation model={evaluation_model}, reason={metric_data.reason}, error={metric_data.error})",
-                        status,
-                        "",
-                    )
-
-            if index is not len(self.test_run.conversational_test_cases) - 1:
-                table.add_row(
-                    "",
-                    "",
-                    "",
-                    "",
-                    "",
+                self._add_test_case_header_row(
+                    table, conversational_test_case.name, pass_count, fail_count
                 )
-
-
-            table.add_row(
-                "",
-                "",
-                "",
-                "",
-                "",
+                self._add_metric_rows(
+                    table, conversational_test_case.metrics_data
                 )

+            if index < len(test_run.conversational_test_cases) - 1:
+                self._add_separator_row(table)
+
+        if index < len(test_run.test_cases) - 1:
+            self._add_separator_row(table)
+
         table.add_row(
             "[bold red]Note: Use Confident AI with DeepEval to analyze failed test cases for more details[/bold red]",
-            "",
-            "",
-            "",
-            "",
+            *[""] * (len(table.columns) - 1),
         )
         print(table)
@@ -799,6 +805,7 @@ class TestRunManager:
         test_run.test_cases = initial_batch

         try:
+            test_run.prompts = None
             body = test_run.model_dump(by_alias=True, exclude_none=True)
         except AttributeError:
             # Pydantic version below 2.0
@@ -947,12 +954,28 @@ class TestRunManager:
         global_test_run_cache_manager.disable_write_cache = not bool(
             get_is_running_deepeval()
         )
-
         global_test_run_cache_manager.wrap_up_cached_test_run()

         if display_table:
             self.display_results_table(test_run, display)

+        if test_run.hyperparameters is None:
+            console.print(
+                "\n[bold yellow]⚠ WARNING:[/bold yellow] No hyperparameters logged.\n"
+                "» [bold blue][link=https://deepeval.com/docs/evaluation-prompts]Log hyperparameters[/link][/bold blue] to attribute prompts and models to your test runs.\n\n"
+                + "=" * 80
+            )
+        else:
+            if not test_run.prompts:
+                console.print(
+                    "\n[bold yellow]⚠ WARNING:[/bold yellow] No prompts logged.\n"
+                    "» [bold blue][link=https://deepeval.com/docs/evaluation-prompts]Log prompts[/link][/bold blue] to evaluate and optimize your prompt templates and models.\n\n"
+                    + "=" * 80
+                )
+            else:
+                console.print("\n[bold green]✓ Prompts Logged[/bold green]\n")
+                self._render_prompts_panels(prompts=test_run.prompts)
+
         self.save_test_run_locally()
         delete_file_if_exists(self.temp_file_path)
         if is_confident() and self.disable_request is False:
@@ -967,7 +990,7 @@ class TestRunManager:
             f"» Test Results ({test_run.test_passed + test_run.test_failed} total tests):\n",
             f" » Pass Rate: {round((test_run.test_passed / (test_run.test_passed + test_run.test_failed)) * 100, 2)}% | Passed: [bold green]{test_run.test_passed}[/bold green] | Failed: [bold red]{test_run.test_failed}[/bold red]\n\n",
             "=" * 80,
-            "\n\n»
+            "\n\n» Want to share evals with your team, or a place for your test cases to live? ❤️ 🏡\n"
             " » Run [bold]'deepeval view'[/bold] to analyze and save testing results on [rgb(106,0,255)]Confident AI[/rgb(106,0,255)].\n\n",
         )

@@ -993,5 +1016,68 @@ class TestRunManager:
             pass
         return None

+    def _render_prompts_panels(self, prompts: List[PromptData]) -> None:
+
+        def format_string(
+            v, default="[dim]None[/dim]", color: Optional[str] = None
+        ):
+            formatted_string = str(v) if v not in (None, "", []) else default
+            return (
+                f"{formatted_string}"
+                if color is None or v in (None, "", [])
+                else f"[{color}]{formatted_string}[/]"
+            )
+
+        panels = []
+        for prompt in prompts:
+            lines = []
+            p_type = (
+                "messages"
+                if prompt.messages_template
+                else ("text" if prompt.text_template else "—")
+            )
+            if p_type:
+                lines.append(f"type: {format_string(p_type, color='blue')}")
+            if prompt.output_type:
+                lines.append(
+                    f"output_type: {format_string(prompt.output_type, color='blue')}"
+                )
+            if prompt.interpolation_type:
+                lines.append(
+                    f"interpolation_type: {format_string(prompt.interpolation_type, color='blue')}"
+                )
+            if prompt.model_settings:
+                ms = prompt.model_settings
+                settings_lines = [
+                    "Model Settings:",
+                    f" – provider: {format_string(ms.provider, color='green')}",
+                    f" – name: {format_string(ms.name, color='green')}",
+                    f" – temperature: {format_string(ms.temperature, color='green')}",
+                    f" – max_tokens: {format_string(ms.max_tokens, color='green')}",
+                    f" – top_p: {format_string(ms.top_p, color='green')}",
+                    f" – frequency_penalty: {format_string(ms.frequency_penalty, color='green')}",
+                    f" – presence_penalty: {format_string(ms.presence_penalty, color='green')}",
+                    f" – stop_sequence: {format_string(ms.stop_sequence, color='green')}",
+                    f" – reasoning_effort: {format_string(ms.reasoning_effort, color='green')}",
+                    f" – verbosity: {format_string(ms.verbosity, color='green')}",
+                ]
+                lines.append("")
+                lines.extend(settings_lines)
+            title = f"{format_string(prompt.alias)}"
+            if prompt.version:
+                title += f" (v{prompt.version})"
+            body = "\n".join(lines)
+            panel = Panel(
+                body,
+                title=title,
+                title_align="left",
+                expand=False,
+                padding=(1, 6, 1, 2),
+            )
+            panels.append(panel)
+
+        if panels:
+            console.print(Columns(panels, equal=False, expand=False))
+

 global_test_run_manager = TestRunManager()
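
For orientation, here is a minimal, self-contained sketch of how the new PromptData shape and the TestRun.prompts field fit together. It uses only the fields visible in the diff above; the TestRunSketch model and the example values are illustrative stand-ins rather than deepeval's real TestRun, and messages_template/model_settings/output_type/interpolation_type are omitted because their types live in deepeval.prompt.

# Illustrative sketch only: mirrors the PromptData fields added in 3.6.8.
from typing import List, Optional

from pydantic import BaseModel, Field


class PromptData(BaseModel):
    alias: Optional[str] = None
    version: Optional[str] = None
    text_template: Optional[str] = None
    # messages_template, model_settings, output_type and interpolation_type
    # are omitted here; they depend on types imported from deepeval.prompt.


class TestRunSketch(BaseModel):
    # Stand-in for the new field on TestRun: prompts defaults to None,
    # which is what triggers the new "No prompts logged" warning path above.
    prompts: Optional[List[PromptData]] = Field(None)


run = TestRunSketch(
    prompts=[
        PromptData(
            alias="summarizer",  # hypothetical alias
            version="1",  # hypothetical version
            text_template="Summarize: {input}",
        )
    ]
)
print(run.model_dump(exclude_none=True))
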
deepeval/tracing/__init__.py
CHANGED

@@ -4,7 +4,7 @@ from .context import (
     update_retriever_span,
     update_llm_span,
 )
-from .trace_context import trace
+from .trace_context import trace, LlmSpanContext
 from .types import BaseSpan, Trace
 from .tracing import observe, trace_manager
 from .offline_evals import evaluate_thread, evaluate_trace, evaluate_span
@@ -14,6 +14,7 @@ __all__ = [
     "update_current_trace",
     "update_retriever_span",
     "update_llm_span",
+    "LlmSpanContext",
     "BaseSpan",
     "Trace",
     "observe",
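
The practical effect of this change is that LlmSpanContext is now re-exported from deepeval.tracing itself rather than only from its trace_context submodule. A minimal sketch of the two equivalent imports, assuming nothing beyond the re-export shown above:

# New in 3.6.8: import from the package root of deepeval.tracing.
from deepeval.tracing import LlmSpanContext

# Pre-existing import path; both names refer to the same class.
from deepeval.tracing.trace_context import LlmSpanContext as _LlmSpanContext

assert LlmSpanContext is _LlmSpanContext
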
deepeval/tracing/api.py
CHANGED

@@ -1,6 +1,6 @@
 from enum import Enum
 from typing import Dict, List, Optional, Union, Literal, Any
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field

 from deepeval.test_case import ToolCall

@@ -27,6 +27,8 @@ class PromptApi(BaseModel):


 class MetricData(BaseModel):
+    model_config = ConfigDict(extra="ignore")
+
     name: str
     threshold: float
     success: bool
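
For context, ConfigDict(extra="ignore") tells Pydantic v2 explicitly to drop unrecognised keys during validation, rather than keeping them (extra="allow") or rejecting them (extra="forbid"). A standalone sketch of that behaviour; the model below is a stand-in that copies only the three fields visible above, not the full MetricData definition:

from pydantic import BaseModel, ConfigDict


class MetricDataSketch(BaseModel):
    # Same config as MetricData in 3.6.8: unknown keys are silently dropped.
    model_config = ConfigDict(extra="ignore")

    name: str
    threshold: float
    success: bool


# "futureField" is not declared on the model, so it is discarded instead of
# being stored or rejected.
m = MetricDataSketch.model_validate(
    {"name": "Faithfulness", "threshold": 0.7, "success": True, "futureField": 1}
)
print(m.model_dump())  # futureField does not appear in the output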
|