PyPI - deepeval - Versions diffs - 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl - Mend

deepeval 3.7.4py3-none-any.whl → 3.7.5py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (155) hide show

deepeval/_version.py +1 -1
deepeval/dataset/golden.py +54 -2
deepeval/evaluate/evaluate.py +16 -8
deepeval/evaluate/execute.py +70 -26
deepeval/evaluate/utils.py +26 -22
deepeval/integrations/pydantic_ai/agent.py +19 -2
deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
deepeval/metrics/__init__.py +14 -12
deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
deepeval/metrics/answer_relevancy/template.py +188 -92
deepeval/metrics/base_metric.py +2 -5
deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
deepeval/metrics/contextual_precision/template.py +115 -66
deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
deepeval/metrics/contextual_recall/template.py +106 -55
deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
deepeval/metrics/contextual_relevancy/template.py +87 -58
deepeval/metrics/dag/templates.py +2 -2
deepeval/metrics/faithfulness/faithfulness.py +70 -27
deepeval/metrics/faithfulness/schema.py +1 -1
deepeval/metrics/faithfulness/template.py +200 -115
deepeval/metrics/g_eval/utils.py +2 -2
deepeval/metrics/indicator.py +4 -4
deepeval/metrics/multimodal_metrics/__init__.py +0 -18
deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
deepeval/metrics/ragas.py +3 -3
deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
deepeval/metrics/turn_contextual_precision/schema.py +21 -0
deepeval/metrics/turn_contextual_precision/template.py +187 -0
deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
deepeval/metrics/turn_contextual_recall/schema.py +21 -0
deepeval/metrics/turn_contextual_recall/template.py +178 -0
deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
deepeval/metrics/turn_faithfulness/template.py +218 -0
deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
deepeval/metrics/utils.py +39 -58
deepeval/models/__init__.py +0 -12
deepeval/models/base_model.py +16 -38
deepeval/models/embedding_models/__init__.py +7 -0
deepeval/models/embedding_models/azure_embedding_model.py +52 -28
deepeval/models/embedding_models/local_embedding_model.py +18 -14
deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
deepeval/models/embedding_models/openai_embedding_model.py +40 -21
deepeval/models/llms/amazon_bedrock_model.py +1 -2
deepeval/models/llms/anthropic_model.py +44 -23
deepeval/models/llms/azure_model.py +121 -36
deepeval/models/llms/deepseek_model.py +18 -13
deepeval/models/llms/gemini_model.py +129 -43
deepeval/models/llms/grok_model.py +18 -13
deepeval/models/llms/kimi_model.py +18 -13
deepeval/models/llms/litellm_model.py +42 -22
deepeval/models/llms/local_model.py +12 -7
deepeval/models/llms/ollama_model.py +114 -12
deepeval/models/llms/openai_model.py +137 -41
deepeval/models/llms/portkey_model.py +24 -7
deepeval/models/llms/utils.py +5 -3
deepeval/models/retry_policy.py +17 -14
deepeval/models/utils.py +46 -1
deepeval/optimizer/__init__.py +5 -0
deepeval/optimizer/algorithms/__init__.py +6 -0
deepeval/optimizer/algorithms/base.py +29 -0
deepeval/optimizer/algorithms/configs.py +18 -0
deepeval/optimizer/algorithms/copro/__init__.py +5 -0
deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
deepeval/optimizer/algorithms/simba/__init__.py +5 -0
deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
deepeval/{optimization → optimizer}/configs.py +5 -8
deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
deepeval/optimizer/prompt_optimizer.py +263 -0
deepeval/optimizer/rewriter/__init__.py +5 -0
deepeval/optimizer/rewriter/rewriter.py +124 -0
deepeval/optimizer/rewriter/utils.py +214 -0
deepeval/optimizer/scorer/__init__.py +5 -0
deepeval/optimizer/scorer/base.py +86 -0
deepeval/optimizer/scorer/scorer.py +316 -0
deepeval/optimizer/scorer/utils.py +30 -0
deepeval/optimizer/types.py +148 -0
deepeval/{optimization → optimizer}/utils.py +47 -165
deepeval/prompt/prompt.py +5 -9
deepeval/test_case/__init__.py +1 -3
deepeval/test_case/api.py +12 -10
deepeval/test_case/conversational_test_case.py +19 -1
deepeval/test_case/llm_test_case.py +152 -1
deepeval/test_case/utils.py +4 -8
deepeval/test_run/api.py +15 -14
deepeval/test_run/test_run.py +3 -3
deepeval/tracing/patchers.py +9 -4
deepeval/tracing/tracing.py +2 -2
deepeval/utils.py +65 -0
{deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
{deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
deepeval/models/mlllms/__init__.py +0 -4
deepeval/models/mlllms/azure_model.py +0 -343
deepeval/models/mlllms/gemini_model.py +0 -313
deepeval/models/mlllms/ollama_model.py +0 -175
deepeval/models/mlllms/openai_model.py +0 -309
deepeval/optimization/__init__.py +0 -13
deepeval/optimization/adapters/__init__.py +0 -2
deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
deepeval/optimization/aggregates.py +0 -14
deepeval/optimization/copro/configs.py +0 -31
deepeval/optimization/gepa/__init__.py +0 -7
deepeval/optimization/gepa/configs.py +0 -115
deepeval/optimization/miprov2/configs.py +0 -134
deepeval/optimization/miprov2/loop.py +0 -785
deepeval/optimization/mutations/__init__.py +0 -0
deepeval/optimization/mutations/prompt_rewriter.py +0 -458
deepeval/optimization/policies/__init__.py +0 -16
deepeval/optimization/policies/tie_breaker.py +0 -67
deepeval/optimization/prompt_optimizer.py +0 -462
deepeval/optimization/simba/__init__.py +0 -0
deepeval/optimization/simba/configs.py +0 -33
deepeval/optimization/types.py +0 -361
deepeval/test_case/mllm_test_case.py +0 -170
/deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
/deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
/deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
/deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
/deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
{deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
{deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
{deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0

deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py RENAMED Viewed

@@ -12,5 +12,11 @@ class ContextualRelevancyVerdicts(BaseModel):
     verdicts: List[ContextualRelevancyVerdict]
-class MultimodelContextualRelevancyScoreReason(BaseModel):
+class ContextualRelevancyScoreReason(BaseModel):
     reason: str
+class InteractionContextualRelevancyScore(BaseModel):
+    score: float
+    reason: str
+    verdicts: List[ContextualRelevancyVerdict]

deepeval/metrics/turn_contextual_relevancy/template.py ADDED Viewed

@@ -0,0 +1,161 @@
+from typing import List, Union
+import textwrap
+from deepeval.test_case import MLLMImage
+class TurnContextualRelevancyTemplate:
+    multimodal_rules = """
+        --- MULTIMODAL INPUT RULES ---
+        - Treat image content as factual evidence.
+        - Only reference visual details that are explicitly and clearly visible.
+        - Do not infer or guess objects, text, or details not visibly present.
+        - If an image is unclear or ambiguous, mark uncertainty explicitly.
+        - When evaluating claims, compare them to BOTH textual and visual evidence.
+        - If the claim references something not clearly visible, respond with 'idk'.
+    """
+    @staticmethod
+    def generate_reason(
+        input: Union[str, List[Union[str, MLLMImage]]],
+        irrelevant_statements: List[str],
+        relevant_statements: List[str],
+        score: float,
+        multimodal: bool = False,
+    ):
+        # Note: irrelevancies parameter name in multimodal version is kept as irrelevant_statements for consistency
+        return textwrap.dedent(
+            f"""Based on the given user message, reasons for why the retrieval context is irrelevant to the user message, the statements in the retrieval context that is actually relevant to the retrieval context, and the contextual relevancy score (the closer to 1 the better), please generate a CONCISE reason for the score.
+            In your reason, you should quote data provided in the reasons for irrelevancy and relevant statements to support your point.
+            {TurnContextualRelevancyTemplate.multimodal_rules if multimodal else ""}
+            **
+            IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+            Example JSON:
+            {{
+                "reason": "The score is <contextual_relevancy_score> because <your_reason>."
+            }}
+            If the score is 1, keep it short and say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
+            **
+            Contextual Relevancy Score:
+            {score}
+            User Message:
+            {input}
+            Reasons for why the retrieval context is irrelevant to the user message:
+            {irrelevant_statements}
+            Statement in the retrieval context that is relevant to the user message:
+            {relevant_statements}
+            JSON:
+            """
+        )
+    @staticmethod
+    def generate_verdicts(
+        input: Union[str, List[Union[str, MLLMImage]]],
+        context: Union[str, List[Union[str, MLLMImage]]],
+        multimodal: bool = False,
+    ):
+        context_type = "context (image or string)" if multimodal else "context"
+        statement_or_image = "statement or image" if multimodal else "statement"
+        # Conditional instructions based on mode
+        extraction_instructions = ""
+        if multimodal:
+            extraction_instructions = textwrap.dedent(
+                """
+                If the context is textual, you should first extract the statements found in the context if the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement.
+                If the context is an image, `statement` should be a description of the image. Do not assume any information not visibly available.
+                """
+            ).strip()
+        else:
+            extraction_instructions = "You should first extract statements found in the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement."
+        # Additional instruction for empty context (only in non-multimodal)
+        empty_context_instruction = ""
+        if not multimodal:
+            empty_context_instruction = '\nIf provided context contains no actual content or statements then: give "no" as a "verdict",\nput context into "statement", and "No statements found in provided context." into "reason".'
+        return textwrap.dedent(
+            f"""Based on the user message and {context_type}, please generate a JSON object to indicate whether {'the context' if multimodal else 'each statement found in the context'} is relevant to the provided user message. The JSON will be a list of 'verdicts', with 2 mandatory fields: 'verdict' and 'statement', and 1 optional field: 'reason'.
+            {extraction_instructions}
+            The 'verdict' key should STRICTLY be either 'yes' or 'no', and states whether the {statement_or_image} is relevant to the user message.
+            Provide a 'reason' ONLY IF verdict is no. You MUST quote the irrelevant parts of the {statement_or_image} to back up your reason.{empty_context_instruction}
+            {TurnContextualRelevancyTemplate.multimodal_rules if multimodal else ""}
+            **
+            IMPORTANT: Please make sure to only return in JSON format.
+            Example Context: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. He won the Nobel Prize in 1968. There was a cat."
+            Example User Message: "What were some of Einstein's achievements?"
+            Example:
+            {{
+                "verdicts": [
+                    {{
+                        "statement": "Einstein won the Nobel Prize for his discovery of the photoelectric effect in 1968",
+                        "verdict": "yes"
+                    }},
+                    {{
+                        "statement": "There was a cat.",
+                        "reason": "The retrieval context contained the information 'There was a cat' when it has nothing to do with Einstein's achievements.",
+                        "verdict": "no"
+                    }}
+                ]
+            }}
+            **
+            User Message:
+            {input}
+            Context:
+            {context}
+            JSON:
+            """
+        )
+    @staticmethod
+    def generate_final_reason(
+        final_score: float, success: bool, reasons: List[str]
+    ):
+        return textwrap.dedent(
+            f"""You are an AI evaluator producing a single final explanation for the TurnContextualRelevancyMetric result.
+            Context:
+            This metric evaluates conversational contextual relevancy by determining whether statements in the retrieval context are relevant to the user message for each interaction. Each interaction yields a reason indicating which statements were relevant or irrelevant. You are given all those reasons.
+            Inputs:
+            - final_score: the averaged score across all interactions.
+            - success: whether the metric passed or failed
+            - reasons: a list of textual reasons generated from individual interactions.
+            Instructions:
+            1. Read all reasons and synthesize them into one unified explanation.
+            2. Describe patterns of irrelevant statements, off-topic context, or well-targeted retrieval if present.
+            3. Do not repeat every reason; merge them into a concise, coherent narrative.
+            4. If the metric failed, state the dominant failure modes. If it passed, state why the retrieval context was relevant to user messages.
+            5. Output a single paragraph with no lists, no bullets, no markup.
+            Output:
+            A single paragraph explaining the final outcome.
+            Here's the inputs:
+            Final Score: {final_score}
+            Reasons:
+            {reasons}
+            Success: {success}
+            Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.
+            The final reason:
+            """
+        )

deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl

deepeval 3.7.4py3-none-any.whl → 3.7.5py3-none-any.whl