deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +811 -248
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +33 -119
- deepeval/integrations/crewai/__init__.py +7 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +20 -9
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +20 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +4 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +85 -50
- deepeval/openai/patch.py +258 -167
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +205 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +245 -62
- deepeval/prompt/utils.py +186 -15
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +19 -15
- deepeval/test_case/api.py +131 -0
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +292 -206
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +24 -5
- deepeval/tracing/trace_context.py +89 -5
- deepeval/tracing/tracing.py +74 -3
- deepeval/tracing/types.py +20 -2
- deepeval/tracing/utils.py +8 -0
- deepeval/utils.py +21 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
--- /dev/null
+++ b/deepeval/metrics/plan_quality/plan_quality.py
@@ -0,0 +1,292 @@
+from typing import Optional, List, Union, Dict
+
+from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.metrics.utils import (
+    construct_verbose_logs,
+    trimAndLoadJson,
+    check_llm_test_case_params,
+    initialize_model,
+)
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.metrics import BaseMetric
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.step_efficiency.template import (
+    StepEfficiencyTemplate,
+)
+from deepeval.metrics.step_efficiency.schema import Task
+from deepeval.metrics.plan_quality.schema import (
+    AgentPlan,
+    PlanQualityScore,
+)
+from deepeval.metrics.plan_quality.template import (
+    PlanQualityTemplate,
+)
+from deepeval.metrics.plan_adherence.template import (
+    PlanAdherenceTemplate,
+)
+from deepeval.metrics.api import metric_data_manager
+
+
+class PlanQualityMetric(BaseMetric):
+
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
+        LLMTestCaseParams.TOOLS_CALLED,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+    ):
+        self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+        self.requires_trace = True
+
+    def measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        has_trace: bool = isinstance(test_case._trace_dict, Dict)
+        if not has_trace:
+            check_llm_test_case_params(test_case, self._required_params, self)
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
+                    )
+                )
+            else:
+                task = self._extract_task_from_trace(test_case)
+                agent_plan = self._extract_plan_from_trace(test_case)
+                if len(agent_plan.plan) == 0:
+                    self.score = 1
+                    self.reason = "There were no plans to evaluate within the trace of your agent's execution. Please check if the agent's planning or reasoning or thinking is stored in any one of the trace attributes."
+                else:
+                    plan_quality_score = self._get_plan_quality_score(
+                        task, agent_plan.plan
+                    )
+                    self.score = (
+                        0
+                        if self.strict_mode
+                        and plan_quality_score.score < self.threshold
+                        else plan_quality_score.score
+                    )
+                    self.reason = plan_quality_score.reason
+                self.success = self.score >= self.threshold
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Task: {task} \n",
+                        f"Agent Plan: \n{prettify_list(agent_plan.plan)} \n",
+                        f"Final Score Score: {self.score} \n",
+                        f"Final Score Reason: {self.reason} \n",
+                    ],
+                )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        has_trace: bool = isinstance(test_case._trace_dict, Dict)
+        if not has_trace:
+            check_llm_test_case_params(test_case, self._required_params, self)
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        ):
+            task = await self._a_extract_task_from_trace(test_case)
+            agent_plan = await self._a_extract_plan_from_trace(test_case)
+            if len(agent_plan.plan) == 0:
+                self.score = 1
+                self.reason = "There are no plans to evaluate within the trace of your agent's execution. Please check if the agent's planning or reasoning or thinking is stored in the trace attributes."
+            else:
+                plan_quality_score = await self._a_get_plan_quality_score(
+                    task, agent_plan.plan
+                )
+                self.score = (
+                    0
+                    if self.strict_mode
+                    and plan_quality_score.score < self.threshold
+                    else plan_quality_score.score
+                )
+                self.reason = plan_quality_score.reason
+            self.success = self.score >= self.threshold
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    f"Task: {task} \n",
+                    f"Agent Plan: \n{prettify_list(agent_plan.plan)} \n",
+                    f"Final Score: {self.score} \n",
+                    f"Final Reason: {self.reason} \n",
+                ],
+            )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    def _get_plan_quality_score(self, task, plan):
+        prompt = PlanQualityTemplate.evaluate_plan_quality(
+            task, "\n".join(plan)
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=PlanQualityScore)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: Task = self.model.generate(prompt, schema=PlanQualityScore)
+                return res
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return PlanQualityScore(**data)
+
+    async def _a_get_plan_quality_score(self, task, plan):
+        prompt = PlanQualityTemplate.evaluate_plan_quality(
+            task, "\n".join(plan)
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(
+                prompt, schema=PlanQualityScore
+            )
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: Task = await self.model.a_generate(
+                    prompt, schema=PlanQualityScore
+                )
+                return res
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return PlanQualityScore(**data)
+
+    def _extract_plan_from_trace(self, test_case: LLMTestCase) -> AgentPlan:
+        prompt = PlanAdherenceTemplate.extract_plan_from_trace(
+            test_case._trace_dict
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=AgentPlan)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: Task = self.model.generate(prompt, schema=AgentPlan)
+                return res
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return AgentPlan(**data)
+
+    async def _a_extract_plan_from_trace(
+        self, test_case: LLMTestCase
+    ) -> AgentPlan:
+        prompt = PlanAdherenceTemplate.extract_plan_from_trace(
+            test_case._trace_dict
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=AgentPlan)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: Task = await self.model.a_generate(
+                    prompt, schema=AgentPlan
+                )
+                return res
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return AgentPlan(**data)
+
+    def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
+        prompt = StepEfficiencyTemplate.extract_task_from_trace(
+            test_case._trace_dict
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=Task)
+            self.evaluation_cost += cost
+            return res.task
+        else:
+            try:
+                res: Task = self.model.generate(prompt, schema=Task)
+                return res.task
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["task"]
+
+    async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
+        prompt = StepEfficiencyTemplate.extract_task_from_trace(
+            test_case._trace_dict
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=Task)
+            self.evaluation_cost += cost
+            return res.task
+        else:
+            try:
+                res: Task = await self.model.a_generate(prompt, schema=Task)
+                return res.task
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["task"]
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Plan Quality"
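For orientation, here is a minimal usage sketch of the new `PlanQualityMetric`, based only on the constructor and `measure()` signature shown in the hunk above. The test case values, tool names, and the `ToolCall` import are illustrative assumptions; note the metric sets `requires_trace = True` and reads the task and plan from `test_case._trace_dict`, so in practice it is run against traced agent executions rather than a bare test case.

```python
# Illustrative sketch only: field values are invented, and a real run needs an
# evaluation model configured (e.g. credentials for the default judge model).
from deepeval.test_case import LLMTestCase, ToolCall
from deepeval.metrics.plan_quality.plan_quality import PlanQualityMetric

metric = PlanQualityMetric(
    threshold=0.5,      # score >= threshold counts as success
    strict_mode=False,  # strict_mode forces the threshold to 1 and zeroes failing scores
    verbose_mode=True,  # surfaces the extracted task/plan via construct_verbose_logs
)

test_case = LLMTestCase(
    input="Book a flight to Tokyo and add it to my calendar",
    actual_output="Flight booked and calendar invite created.",
    tools_called=[ToolCall(name="search_flights"), ToolCall(name="create_event")],
    # The plan itself is read from the trace attached to the test case
    # (test_case._trace_dict), typically populated by running the agent
    # under deepeval tracing.
)

metric.measure(test_case)
print(metric.score, metric.reason)
```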
--- /dev/null
+++ b/deepeval/metrics/plan_quality/template.py
@@ -0,0 +1,101 @@
+import textwrap
+import json
+from deepeval.tracing.utils import make_json_serializable
+
+
+class PlanQualityTemplate:
+
+    @staticmethod
+    def evaluate_plan_quality(user_task: str, agent_plan: list) -> str:
+        return textwrap.dedent(
+            f"""You are a **plan quality evaluator**. Your task is to critically assess the **quality, completeness, and optimality** of an AI agent's plan to accomplish the given user task.
+
+            INPUTS:
+
+            - **User Task:** The user's explicit goal or instruction.
+            - **Agent Plan:** The ordered list of steps the agent intends to follow to achieve that goal.
+
+            EVALUATION OBJECTIVE:
+
+            Judge the **intrinsic quality** of the plan — whether the plan itself is strong enough to fully and efficiently achieve the user's task.
+
+            The evaluation must be **strict**. If the plan is incomplete, inefficient, redundant, or missing critical details, assign a very low score.
+
+            STRICT EVALUATION CRITERIA:
+
+            1. Completeness (Most Important)
+            - The plan must fully address all major requirements of the user task.
+            - Missing even one critical subtask or dependency should reduce the score sharply.
+            - The plan must include all prerequisite actions necessary for the final outcome.
+
+            2. Logical Coherence
+            - Steps must follow a clear, rational sequence that leads directly to completing the task.
+            - Disordered, redundant, or circular reasoning should be penalized heavily.
+            - Every step must have a clear purpose; no filler or irrelevant actions.
+
+            3. Optimality and Efficiency
+            - The plan must be **minimal but sufficient** — no unnecessary or repetitive steps.
+            - If a more direct, simpler, or logically superior plan could achieve the same outcome, the current plan should receive a lower score.
+
+            4. Level of Detail
+            - Each step should be specific enough for an agent to execute it reliably without ambiguity.
+            - Vague steps (e.g., “Do research”, “Handle results”) that lack operational clarity
+            lower the score.
+
+            5. Alignment with Task
+            - The plan must explicitly and directly target the user's stated goal.
+            - If any step diverges from the main objective, the score should drop significantly.
+
+            ---
+
+            SCORING SCALE (STRICT)
+
+            - **1.0 — Excellent plan**
+            - Fully complete, logically ordered, and optimally efficient.
+            - No missing, redundant, or ambiguous steps.
+            - Directly fulfills every aspect of the user task.
+
+            - **0.75 — Good plan**
+            - Covers nearly all aspects of the task with clear logic.
+            - Minor gaps or small inefficiencies that do not block task completion.
+
+            - **0.5 — Adequate but flawed plan**
+            - Partially complete; key details missing or step order inefficient.
+            - Some ambiguity or redundancy that would likely affect execution success.
+
+            - **0.25 — Weak plan**
+            - Major missing steps or unclear logic.
+            - The plan would likely fail to complete the task as written.
+
+            - **0.0 — Inadequate plan**
+            - Irrelevant, incoherent, or severely incomplete plan.
+            - Does not align with the user’s task or cannot plausibly achieve it.
+
+            *When in doubt, assign the lower score.*
+
+            OUTPUT FORMAT:
+
+            Return a JSON object with this exact structure:
+
+            {{
+            "score": 0.0,
+            "reason": "1-3 short, precise sentences explaining what the plan lacks or how it could fail."
+            }}
+
+            The `"reason"` must:
+            - Reference specific missing, unclear, or inefficient steps.
+            - Avoid vague language (“seems fine”, “mostly works”).
+            - Use objective terms describing gaps or weaknesses.
+
+            PROVIDED DATA
+
+            User Task:
+            {user_task}
+
+            Agent Plan:
+            {agent_plan}
+
+
+            JSON:
+            """
+        )
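A quick way to see what the judge model receives is to render the template directly. This is a small illustrative snippet using the static method defined above; the task and plan strings are invented, and the plan is pre-joined with newlines the same way `PlanQualityMetric` does before calling it.

```python
from deepeval.metrics.plan_quality.template import PlanQualityTemplate

# Hypothetical task and plan, just to inspect the rendered judge prompt.
task = "Summarize the latest quarterly report and email it to the finance team"
plan = "\n".join(
    [
        "1. Locate the latest quarterly report",
        "2. Summarize the key figures",
        "3. Draft and send the email to the finance team",
    ]
)

prompt = PlanQualityTemplate.evaluate_plan_quality(task, plan)
print(prompt)  # ends with "JSON:", expecting a {"score": ..., "reason": ...} object back
```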
--- a/deepeval/metrics/prompt_alignment/prompt_alignment.py
+++ b/deepeval/metrics/prompt_alignment/prompt_alignment.py
@@ -20,6 +20,8 @@ from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.prompt_alignment import schema as paschema
 from deepeval.config.settings import get_settings
 
+from deepeval.metrics.api import metric_data_manager
+
 
 class PromptAlignmentMetric(BaseMetric):
 
@@ -55,6 +57,7 @@ class PromptAlignmentMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -93,6 +96,10 @@ class PromptAlignmentMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -101,6 +108,7 @@ class PromptAlignmentMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -128,7 +136,10 @@ class PromptAlignmentMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, input: str, actual_output: str) -> str:
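The same two additions, a `_log_metric_to_confident` keyword argument and a `metric_data_manager.post_metric_if_enabled(...)` call after the verbose logs are built, repeat below for `RoleAdherenceMetric` and `RoleViolationMetric`. A hedged sketch of what the flag means for callers follows; the constructor arguments and test case values are illustrative, and the underscore prefix marks the flag as internal, so it may change without notice.

```python
from deepeval.test_case import LLMTestCase
from deepeval.metrics import PromptAlignmentMetric

# Example instructions and test case are made up for illustration.
metric = PromptAlignmentMetric(prompt_instructions=["Reply in formal English"])
test_case = LLMTestCase(
    input="how do i reset my password?",
    actual_output="To reset your password, please visit the account settings page.",
)

# Default behaviour: the metric result may be posted to Confident AI when enabled.
metric.measure(test_case)

# Opting out for a single call via the new internal flag.
metric.measure(test_case, _log_metric_to_confident=False)
```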
--- a/deepeval/metrics/role_adherence/role_adherence.py
+++ b/deepeval/metrics/role_adherence/role_adherence.py
@@ -1,6 +1,7 @@
 from typing import Optional, Union, List
 
 from deepeval.metrics import BaseConversationalMetric
+from deepeval.metrics.api import metric_data_manager
 from deepeval.metrics.role_adherence.schema import (
     OutOfCharacterResponseVerdicts,
 )
@@ -44,6 +45,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case,
@@ -63,6 +65,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -82,6 +85,10 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def a_measure(
@@ -89,6 +96,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case,
@@ -124,6 +132,10 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, role: str) -> str:
--- a/deepeval/metrics/role_violation/role_violation.py
+++ b/deepeval/metrics/role_violation/role_violation.py
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.role_violation.template import RoleViolationTemplate
 from deepeval.metrics.role_violation.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class RoleViolationMetric(BaseMetric):
@@ -58,6 +59,7 @@ class RoleViolationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -73,6 +75,7 @@ class RoleViolationMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -94,6 +97,10 @@ class RoleViolationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -102,6 +109,7 @@ class RoleViolationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -131,6 +139,10 @@ class RoleViolationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
--- /dev/null
+++ b/deepeval/metrics/step_efficiency/__init__.py
@@ -0,0 +1 @@
+from .step_efficiency import StepEfficiencyMetric
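Finally, the one-line `__init__.py` above makes the new metric importable from its package path. The other new metric packages in this release (goal_accuracy, plan_adherence, plan_quality, tool_use, topic_adherence) each gain a similar one-line `__init__.py` per the file list; assuming they follow the same re-export pattern, imports would look like the sketch below. Only the `StepEfficiencyMetric` line is confirmed by the diff content shown here.

```python
# Only the first import is confirmed by the __init__.py shown above; the second
# assumes the plan_quality package uses the same one-line re-export pattern.
from deepeval.metrics.step_efficiency import StepEfficiencyMetric
from deepeval.metrics.plan_quality import PlanQualityMetric  # assumed re-export
```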