deepeval 3.7.2__py3-none-any.whl → 3.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/human_eval/human_eval.py +2 -1
- deepeval/cli/test.py +1 -1
- deepeval/config/settings.py +102 -13
- deepeval/dataset/dataset.py +35 -11
- deepeval/dataset/utils.py +2 -0
- deepeval/evaluate/configs.py +1 -1
- deepeval/evaluate/execute.py +4 -1
- deepeval/metrics/answer_relevancy/template.py +4 -4
- deepeval/metrics/argument_correctness/template.py +2 -2
- deepeval/metrics/bias/template.py +3 -3
- deepeval/metrics/contextual_precision/template.py +6 -6
- deepeval/metrics/contextual_recall/template.py +2 -2
- deepeval/metrics/contextual_relevancy/template.py +3 -3
- deepeval/metrics/conversation_completeness/template.py +2 -2
- deepeval/metrics/conversational_dag/templates.py +4 -4
- deepeval/metrics/conversational_g_eval/template.py +4 -3
- deepeval/metrics/dag/templates.py +4 -4
- deepeval/metrics/faithfulness/template.py +4 -4
- deepeval/metrics/hallucination/template.py +4 -4
- deepeval/metrics/misuse/template.py +2 -2
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +7 -7
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +6 -6
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +2 -2
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +3 -3
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +9 -9
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +4 -4
- deepeval/metrics/non_advice/template.py +2 -2
- deepeval/metrics/pii_leakage/template.py +2 -2
- deepeval/metrics/prompt_alignment/template.py +4 -4
- deepeval/metrics/role_violation/template.py +2 -2
- deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
- deepeval/metrics/toxicity/template.py +4 -4
- deepeval/metrics/turn_relevancy/template.py +2 -2
- deepeval/metrics/utils.py +3 -0
- deepeval/models/__init__.py +2 -0
- deepeval/models/embedding_models/azure_embedding_model.py +28 -15
- deepeval/models/embedding_models/local_embedding_model.py +23 -10
- deepeval/models/embedding_models/ollama_embedding_model.py +8 -6
- deepeval/models/embedding_models/openai_embedding_model.py +18 -2
- deepeval/models/llms/anthropic_model.py +17 -5
- deepeval/models/llms/azure_model.py +30 -18
- deepeval/models/llms/deepseek_model.py +22 -12
- deepeval/models/llms/gemini_model.py +120 -87
- deepeval/models/llms/grok_model.py +23 -16
- deepeval/models/llms/kimi_model.py +23 -12
- deepeval/models/llms/litellm_model.py +63 -25
- deepeval/models/llms/local_model.py +26 -18
- deepeval/models/llms/ollama_model.py +17 -7
- deepeval/models/llms/openai_model.py +22 -17
- deepeval/models/llms/portkey_model.py +132 -0
- deepeval/models/mlllms/__init__.py +1 -0
- deepeval/models/mlllms/azure_model.py +343 -0
- deepeval/models/mlllms/gemini_model.py +102 -73
- deepeval/models/mlllms/ollama_model.py +40 -9
- deepeval/models/mlllms/openai_model.py +65 -14
- deepeval/models/utils.py +48 -3
- deepeval/optimization/__init__.py +13 -0
- deepeval/optimization/adapters/__init__.py +2 -0
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +588 -0
- deepeval/optimization/aggregates.py +14 -0
- deepeval/optimization/configs.py +34 -0
- deepeval/optimization/copro/configs.py +31 -0
- deepeval/optimization/copro/loop.py +837 -0
- deepeval/optimization/gepa/__init__.py +7 -0
- deepeval/optimization/gepa/configs.py +115 -0
- deepeval/optimization/gepa/loop.py +677 -0
- deepeval/optimization/miprov2/configs.py +134 -0
- deepeval/optimization/miprov2/loop.py +785 -0
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +458 -0
- deepeval/optimization/policies/__init__.py +16 -0
- deepeval/optimization/policies/selection.py +166 -0
- deepeval/optimization/policies/tie_breaker.py +67 -0
- deepeval/optimization/prompt_optimizer.py +462 -0
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +33 -0
- deepeval/optimization/simba/loop.py +983 -0
- deepeval/optimization/simba/types.py +15 -0
- deepeval/optimization/types.py +361 -0
- deepeval/optimization/utils.py +598 -0
- deepeval/prompt/prompt.py +10 -5
- deepeval/test_run/cache.py +2 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/tracing/context.py +3 -0
- deepeval/tracing/tracing.py +22 -11
- deepeval/utils.py +24 -0
- {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/METADATA +1 -1
- {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/RECORD +92 -66
- {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/entry_points.txt +1 -1
- {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/WHEEL +0 -0
deepeval/_version.py
CHANGED
@@ -1 +1 @@
-__version__: str = "3.7.2"
+__version__: str = "3.7.4"
deepeval/benchmarks/human_eval/human_eval.py
CHANGED
@@ -92,7 +92,7 @@ class HumanEval(DeepEvalBaseBenchmark):
         self.predictions: Optional[pd.DataFrame] = None
         self.task_scores: Optional[pd.DataFrame] = None
         self.overall_score: Optional[float] = None
-        self.verbose_mode: bool =
+        self.verbose_mode: bool = verbose_mode

     def evaluate(
         self, model: DeepEvalBaseLLM, *args, k: int = 1, **kwargs
@@ -123,6 +123,7 @@ class HumanEval(DeepEvalBaseBenchmark):
                 task.value,
                 golden.input,
                 prediction,
+                task_correct,
                 golden.expected_output,
                 score,
             )
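The first hunk makes the constructor's verbose_mode argument actually stick. A minimal sketch of what that enables, assuming HumanEval is importable from deepeval.benchmarks and the rest of its constructor signature is unchanged:

from deepeval.benchmarks import HumanEval

# Hypothetical usage: verbose_mode passed at construction is now stored on
# the instance (previously the assignment was effectively dropped).
benchmark = HumanEval(verbose_mode=True)
assert benchmark.verbose_mode is True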
deepeval/cli/test.py
CHANGED
@@ -160,7 +160,7 @@ def run(
    pytest_args.extend(["--identifier", identifier])

    # Add the deepeval plugin file to pytest arguments
-    pytest_args.extend(["-p", "
+    pytest_args.extend(["-p", "deepeval"])
    # Append the extra arguments collected by allow_extra_args=True
    # Pytest will raise its own error if the arguments are invalid (error:
    if ctx.args:
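The plugin is now registered by its installed entry-point name rather than a bundled module path. A rough, hedged equivalent of what the CLI builds (the test file path is a placeholder):

import pytest

# The CLI now passes "-p deepeval", so pytest loads the deepeval plugin by
# name; the same flag works when invoking pytest directly.
exit_code = pytest.main(["test_llm_app.py", "-p", "deepeval"])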
deepeval/config/settings.py
CHANGED
@@ -49,6 +49,8 @@ _DEPRECATED_TO_OVERRIDE = {
     "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS": "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE",
     "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS": "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE",
 }
+# Track which secrets we've warned about when loading from the legacy keyfile
+_LEGACY_KEYFILE_SECRET_WARNED: set[str] = set()


 def _find_legacy_enum(env_key: str):
@@ -88,6 +90,82 @@ def _is_secret_key(settings: "Settings", env_key: str) -> bool:
     return False


+def _merge_legacy_keyfile_into_env() -> None:
+    """
+    Backwards compatibility: merge values from the legacy .deepeval/.deepeval
+    JSON keystore into os.environ for known Settings fields, without
+    overwriting existing process env vars.
+
+    This runs before we compute the Settings env fingerprint so that Pydantic
+    can see these values on first construction.
+
+    Precedence: process env -> dotenv -> legacy json
+    """
+    # if somebody really wants to skip this behavior
+    if parse_bool(os.getenv("DEEPEVAL_DISABLE_LEGACY_KEYFILE"), default=False):
+        return
+
+    from deepeval.constants import HIDDEN_DIR, KEY_FILE
+    from deepeval.key_handler import (
+        KeyValues,
+        ModelKeyValues,
+        EmbeddingKeyValues,
+        SECRET_KEYS,
+    )
+
+    key_path = Path(HIDDEN_DIR) / KEY_FILE
+
+    try:
+        with key_path.open("r", encoding="utf-8") as f:
+            try:
+                data = json.load(f)
+            except json.JSONDecodeError:
+                # Corrupted file -> ignore, same as KeyFileHandler
+                return
+    except FileNotFoundError:
+        # No legacy store -> nothing to merge
+        return
+
+    if not isinstance(data, dict):
+        return
+
+    # Map JSON keys (enum .value) -> env keys (enum .name)
+    mapping: Dict[str, str] = {}
+    for enum in (KeyValues, ModelKeyValues, EmbeddingKeyValues):
+        for member in enum:
+            mapping[member.value] = member.name
+
+    for json_key, raw in data.items():
+        env_key = mapping.get(json_key)
+        if not env_key:
+            continue
+
+        # Process env always wins
+        if env_key in os.environ:
+            continue
+        if raw is None:
+            continue
+
+        # Mirror the legacy warning semantics for secrets, but only once per key
+        if (
+            json_key in SECRET_KEYS
+            and json_key not in _LEGACY_KEYFILE_SECRET_WARNED
+        ):
+            logger.warning(
+                "Reading secret '%s' from legacy %s/%s. "
+                "Persisting API keys in plaintext is deprecated. "
+                "Move this to your environment (.env / .env.local). "
+                "This fallback will be removed in a future release.",
+                json_key,
+                HIDDEN_DIR,
+                KEY_FILE,
+            )
+            _LEGACY_KEYFILE_SECRET_WARNED.add(json_key)
+
+        # Let Settings validators coerce types; we just inject the raw string
+        os.environ[env_key] = str(raw)
+
+
 def _read_env_file(path: Path) -> Dict[str, str]:
     if not path.exists():
         return {}
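For anyone who does not want the legacy keystore consulted at all, the new function honours an opt-out before it reads anything. A minimal sketch, using only names that appear in the hunk above:

import os

# Setting this before deepeval is imported skips the legacy .deepeval/.deepeval
# merge entirely; _merge_legacy_keyfile_into_env() returns immediately.
os.environ["DEEPEVAL_DISABLE_LEGACY_KEYFILE"] = "1"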
@@ -258,6 +336,7 @@ class Settings(BaseSettings):
     GOOGLE_GENAI_USE_VERTEXAI: Optional[bool] = None
     GOOGLE_CLOUD_PROJECT: Optional[str] = None
     GOOGLE_CLOUD_LOCATION: Optional[str] = None
+    GOOGLE_SERVICE_ACCOUNT_KEY: Optional[str] = None
     # Grok
     USE_GROK_MODEL: Optional[bool] = None
     GROK_API_KEY: Optional[SecretStr] = None
@@ -291,6 +370,12 @@
     OPENAI_MODEL_NAME: Optional[str] = None
     OPENAI_COST_PER_INPUT_TOKEN: Optional[float] = None
     OPENAI_COST_PER_OUTPUT_TOKEN: Optional[float] = None
+    # PortKey
+    USE_PORTKEY_MODEL: Optional[bool] = None
+    PORTKEY_API_KEY: Optional[SecretStr] = None
+    PORTKEY_MODEL_NAME: Optional[str] = None
+    PORTKEY_BASE_URL: Optional[AnyUrl] = None
+    PORTKEY_PROVIDER_NAME: Optional[str] = None
     # Vertex AI
     VERTEX_AI_MODEL_NAME: Optional[str] = None
     # VLLM
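The new PortKey block follows the same pattern as the other provider blocks, so the values can be supplied through the environment (or a .env file). A hedged example with placeholder values; the field names are exactly the new Settings fields above:

import os

os.environ["USE_PORTKEY_MODEL"] = "1"
os.environ["PORTKEY_API_KEY"] = "pk-..."          # placeholder
os.environ["PORTKEY_MODEL_NAME"] = "gpt-4o"       # placeholder
os.environ["PORTKEY_BASE_URL"] = "https://api.portkey.ai/v1"
os.environ["PORTKEY_PROVIDER_NAME"] = "openai"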
@@ -516,29 +601,30 @@
         "CONFIDENT_OPEN_BROWSER",
         "CONFIDENT_TRACE_FLUSH",
         "CONFIDENT_TRACE_VERBOSE",
+        "CUDA_LAUNCH_BLOCKING",
+        "DEEPEVAL_VERBOSE_MODE",
+        "DEEPEVAL_GRPC_LOGGING",
+        "DEEPEVAL_DISABLE_DOTENV",
+        "DEEPEVAL_TELEMETRY_OPT_OUT",
+        "DEEPEVAL_UPDATE_WARNING_OPT_IN",
+        "ENABLE_DEEPEVAL_CACHE",
+        "ERROR_REPORTING",
+        "GOOGLE_GENAI_USE_VERTEXAI",
+        "IGNORE_DEEPEVAL_ERRORS",
+        "SKIP_DEEPEVAL_MISSING_PARAMS",
+        "TOKENIZERS_PARALLELISM",
+        "TRANSFORMERS_NO_ADVISORY_WARNINGS",
         "USE_OPENAI_MODEL",
         "USE_AZURE_OPENAI",
         "USE_LOCAL_MODEL",
         "USE_GEMINI_MODEL",
-        "GOOGLE_GENAI_USE_VERTEXAI",
         "USE_MOONSHOT_MODEL",
         "USE_GROK_MODEL",
         "USE_DEEPSEEK_MODEL",
         "USE_LITELLM",
         "USE_AZURE_OPENAI_EMBEDDING",
         "USE_LOCAL_EMBEDDINGS",
-        "
-        "DEEPEVAL_DISABLE_DOTENV",
-        "DEEPEVAL_TELEMETRY_OPT_OUT",
-        "DEEPEVAL_UPDATE_WARNING_OPT_IN",
-        "TOKENIZERS_PARALLELISM",
-        "TRANSFORMERS_NO_ADVISORY_WARNINGS",
-        "CUDA_LAUNCH_BLOCKING",
-        "ERROR_REPORTING",
-        "IGNORE_DEEPEVAL_ERRORS",
-        "SKIP_DEEPEVAL_MISSING_PARAMS",
-        "DEEPEVAL_VERBOSE_MODE",
-        "ENABLE_DEEPEVAL_CACHE",
+        "USE_PORTKEY_MODEL",
         mode="before",
     )
     @classmethod
@@ -1008,6 +1094,9 @@ _settings_lock = threading.RLock()


 def _calc_env_fingerprint() -> str:
+    # Pull legacy .deepeval JSON-based settings into the process env before hashing
+    _merge_legacy_keyfile_into_env()
+
     env = os.environ.copy()
     # must hash in a stable order.
     keys = sorted(
deepeval/dataset/dataset.py
CHANGED
@@ -189,17 +189,35 @@ class EvaluationDataset:
         test_case._dataset_alias = self._alias
         test_case._dataset_id = self._id
         if isinstance(test_case, LLMTestCase):
+            if self._conversational_goldens or self._conversational_test_cases:
+                raise TypeError(
+                    "You cannot add 'LLMTestCase' to a multi-turn dataset."
+                )
             test_case._dataset_rank = len(self._llm_test_cases)
             self._llm_test_cases.append(test_case)
         elif isinstance(test_case, ConversationalTestCase):
+            if self._goldens or self._llm_test_cases:
+                raise TypeError(
+                    "You cannot add 'ConversationalTestCase' to a single-turn dataset."
+                )
+            self._multi_turn = True
             test_case._dataset_rank = len(self._conversational_test_cases)
             self._conversational_test_cases.append(test_case)

     def add_golden(self, golden: Union[Golden, ConversationalGolden]):
-        if
-            self.
-
+        if isinstance(golden, Golden):
+            if self._conversational_goldens or self._conversational_test_cases:
+                raise TypeError(
+                    "You cannot add 'Golden' to a multi-turn dataset."
+                )
             self._add_golden(golden)
+        else:
+            if self._goldens or self._llm_test_cases:
+                raise TypeError(
+                    "You cannot add 'ConversationalGolden' to a single-turn dataset."
+                )
+            self._multi_turn = True
+            self._add_conversational_golden(golden)

     def _add_golden(self, golden: Union[Golden, ConversationalGolden]):
         if isinstance(golden, Golden):
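With these guards, a dataset is locked to either single-turn or multi-turn content as soon as the first item is added. A small sketch of the new failure mode (imports assumed from deepeval's public dataset API):

from deepeval.dataset import EvaluationDataset, Golden, ConversationalGolden

dataset = EvaluationDataset()
dataset.add_golden(Golden(input="What is DeepEval?"))

# Mixing turn types now raises instead of silently mis-filing the golden.
try:
    dataset.add_golden(ConversationalGolden(scenario="User asks for a refund."))
except TypeError as e:
    print(e)  # You cannot add 'ConversationalGolden' to a single-turn dataset.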
@@ -224,16 +242,16 @@ class EvaluationDataset:
         file_path: str,
         input_col_name: str,
         actual_output_col_name: str,
-        expected_output_col_name: Optional[str] =
-        context_col_name: Optional[str] =
+        expected_output_col_name: Optional[str] = "expected_output",
+        context_col_name: Optional[str] = "context",
         context_col_delimiter: str = ";",
-        retrieval_context_col_name: Optional[str] =
+        retrieval_context_col_name: Optional[str] = "retrieval_context",
         retrieval_context_col_delimiter: str = ";",
-        tools_called_col_name: Optional[str] =
+        tools_called_col_name: Optional[str] = "tools_called",
         tools_called_col_delimiter: str = ";",
-        expected_tools_col_name: Optional[str] =
+        expected_tools_col_name: Optional[str] = "expected_tools",
         expected_tools_col_delimiter: str = ";",
-        additional_metadata_col_name: Optional[str] =
+        additional_metadata_col_name: Optional[str] = "additional_metadata",
     ):
         """
         Load test cases from a CSV file.
@@ -379,6 +397,7 @@ class EvaluationDataset:
         retrieval_context_key_name: Optional[str] = None,
         tools_called_key_name: Optional[str] = None,
         expected_tools_key_name: Optional[str] = None,
+        addtional_metadata_key_name: Optional[str] = None,
         encoding_type: str = "utf-8",
     ):
         """
@@ -431,6 +450,7 @@ class EvaluationDataset:
             tools_called = [ToolCall(**tool) for tool in tools_called_data]
             expected_tools_data = json_obj.get(expected_tools_key_name, [])
             expected_tools = [ToolCall(**tool) for tool in expected_tools_data]
+            # additional_metadata = json_obj.get(addtional_metadata_key_name)

             self.add_test_case(
                 LLMTestCase(
@@ -441,6 +461,7 @@ class EvaluationDataset:
                     retrieval_context=retrieval_context,
                     tools_called=tools_called,
                     expected_tools=expected_tools,
+                    # additional_metadata=additional_metadata,
                 )
             )

@@ -460,8 +481,8 @@ class EvaluationDataset:
         expected_tools_col_delimiter: str = ";",
         comments_key_name: str = "comments",
         name_key_name: str = "name",
-        source_file_col_name: Optional[str] =
-        additional_metadata_col_name: Optional[str] =
+        source_file_col_name: Optional[str] = "source_file",
+        additional_metadata_col_name: Optional[str] = "additional_metadata",
         scenario_col_name: Optional[str] = "scenario",
         turns_col_name: Optional[str] = "turns",
         expected_outcome_col_name: Optional[str] = "expected_outcome",
@@ -587,6 +608,7 @@ class EvaluationDataset:
                         context=context,
                         comments=comments,
                         name=name,
+                        additional_metadata=additional_metadata,
                     )
                 )
             else:
@@ -645,6 +667,7 @@ class EvaluationDataset:
                 comments = json_obj.get(comments_key_name)
                 name = json_obj.get(name_key_name)
                 parsed_turns = parse_turns(turns) if turns else []
+                additional_metadata = json_obj.get(additional_metadata_key_name)

                 self._multi_turn = True
                 self.goldens.append(
@@ -656,6 +679,7 @@ class EvaluationDataset:
                         context=context,
                         comments=comments,
                         name=name,
+                        additional_metadata=additional_metadata,
                     )
                 )
             else:
deepeval/dataset/utils.py
CHANGED
@@ -24,6 +24,7 @@ def convert_test_cases_to_goldens(
             "retrieval_context": test_case.retrieval_context,
             "tools_called": test_case.tools_called,
             "expected_tools": test_case.expected_tools,
+            "additional_metadata": test_case.additional_metadata,
         }
         goldens.append(Golden(**golden))
     return goldens
@@ -70,6 +71,7 @@ def convert_convo_test_cases_to_convo_goldens(
             "expected_outcome": test_case.expected_outcome,
             "user_description": test_case.user_description,
             "context": test_case.context,
+            "additional_metadata": test_case.additional_metadata,
         }
         goldens.append(ConversationalGolden(**golden))
     return goldens
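Both converters now carry additional_metadata across, so round-tripping a test case into a golden no longer drops it. A minimal sketch, assuming convert_test_cases_to_goldens takes the list of test cases as its only required argument (the hunk header cuts off the signature):

from deepeval.dataset.utils import convert_test_cases_to_goldens
from deepeval.test_case import LLMTestCase

tc = LLMTestCase(
    input="What is the capital of France?",
    actual_output="Paris",
    additional_metadata={"source": "smoke-test"},  # previously lost on conversion
)
golden = convert_test_cases_to_goldens([tc])[0]
assert golden.additional_metadata == {"source": "smoke-test"}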
deepeval/evaluate/configs.py
CHANGED
deepeval/evaluate/execute.py
CHANGED
@@ -718,6 +718,8 @@ async def a_execute_test_cases(
                 "Gather timed out after %.1fs. Some metrics may be marked as timed out.",
                 _gather_timeout(),
             )
+            if not error_config.ignore_errors:
+                raise

         else:
             for test_case in test_cases:
@@ -803,7 +805,8 @@
             if not t.done():
                 t.cancel()
         await asyncio.gather(*tasks, return_exceptions=True)
-
+        if not error_config.ignore_errors:
+            raise

     return test_results

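Both hunks route through error_config.ignore_errors: when it is False, a gather timeout or task failure now re-raises instead of being logged and swallowed. A hedged sketch of the relevant config; the class name and import path are assumptions based on the configs.py file listed in this diff:

from deepeval.evaluate.configs import ErrorConfig

lenient = ErrorConfig(ignore_errors=True)   # keep the old swallow-and-continue behaviour
strict = ErrorConfig(ignore_errors=False)   # timeouts and failures propagate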
@@ -49,12 +49,12 @@ Expected JSON format:
         "verdict": "yes"
     }},
     {{
-        "
-        "
+        "reason": <explanation_for_irrelevance>,
+        "verdict": "no"
     }},
     {{
-        "
-        "
+        "reason": <explanation_for_ambiguity>,
+        "verdict": "idk"
     }}
 ]
 }}
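Across the template files in this release the change is the same: the few-shot JSON examples now emit "reason" before "verdict", presumably so the judge model produces its justification before committing to a label. Illustrative only, not code from the package:

# Shape of a single verdict entry after the reordering:
verdict_entry = {
    "reason": "The statement does not address the question.",
    "verdict": "no",
}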
@@ -70,8 +70,8 @@ class ArgumentCorrectnessTemplate:
         "verdict": "yes"
     }},
     {{
-        "
-        "
+        "reason": "Recommending romantic Parisian comedies does not help find the highest temperature in 2023.",
+        "verdict": "no"
     }}
 ]
 }}
@@ -64,15 +64,15 @@ Example JSON:
 {{
     "verdicts": [
         {{
-            "
-            "
+            "reason": "The opinion "Government meddling in healthcare bloats costs and quashes innovation" reveals a political bias, emphasizing negative views on government involvement.",
+            "verdict": "yes"
         }},
         {{
             "verdict": "no"
         }},
         {{
             "verdict": "no"
-        }}
+        }}
     ]
 }}

@@ -19,16 +19,16 @@ Example:
 {{
     "verdicts": [
         {{
-            "
-            "
+            "reason": "It clearly addresses the question by stating that 'Einstein won the Nobel Prize for his discovery of the photoelectric effect.'",
+            "verdict": "yes"
         }},
         {{
-            "
-            "
+            "reason": "The text verifies that the prize was indeed won in 1968.",
+            "verdict": "yes"
         }},
         {{
-            "
-            "
+            "reason": "'There was a cat' is not at all relevant to the topic of winning a Nobel Prize.",
+            "verdict": "no"
         }}
     ]
 }}
@@ -55,13 +55,13 @@ Example:
 {{
     "verdicts": [
         {{
-            "verdict": "yes",
             "statement": "Einstein won the Nobel Prize for his discovery of the photoelectric effect in 1968",
+            "verdict": "yes"
         }},
         {{
-            "verdict": "no",
             "statement": "There was a cat.",
-            "reason": "The retrieval context contained the information 'There was a cat' when it has nothing to do with Einstein's achievements."
+            "reason": "The retrieval context contained the information 'There was a cat' when it has nothing to do with Einstein's achievements.",
+            "verdict": "no"
         }}
     ]
 }}
@@ -86,8 +86,8 @@ User wants to tell the assistant something.

 Example JSON:
 {{
-    "
-    "
+    "reason": "The user wanted to tell the assistant something but the LLM not only refused to answer but replied 'Oh ok, in that case should you need anything just let me know!', which is completely irrelevant and doesn't satisfy the user at all.",
+    "verdict": "no"
 }}
 ===== END OF EXAMPLE ======

@@ -77,8 +77,8 @@ class ConversationalBinaryJudgementTemplate:

 Example:
 {{
-    "
-    "
+    "reason": "The assistant provided a clear and direct answer in response to every user query.",
+    "verdict": true
 }}
 **
 JSON:
@@ -108,8 +108,8 @@ class ConversationalNonBinaryJudgementTemplate:

 Example:
 {{
-    "
-    "
+    "reason": "The assistant partially addressed the user's issue but missed clarifying their follow-up question.",
+    "verdict": "{options[1]}"
 }}
 **
 JSON:
@@ -63,8 +63,8 @@ class BinaryJudgementTemplate:
 IMPORTANT: Please make sure to only return a json with two keys: `verdict` (True or False), and the 'reason' key providing the reason. The verdict must be a boolean only, either True or False.
 Example JSON:
 {{
-    "
-    "
+    "reason": "...",
+    "verdict": True
 }}
 **

@@ -85,8 +85,8 @@ class NonBinaryJudgementTemplate:
 IMPORTANT: Please make sure to only return a json with two keys: 'verdict' {options} and 'reason' providing the reason.
 Example JSON:
 {{
-    "
-    "
+    "reason": "...",
+    "verdict": {options}
 }}
 **

@@ -83,12 +83,12 @@ Expected JSON format:
         "verdict": "yes"
     }},
     {{
-        "
-        "
+        "reason": <explanation_for_contradiction>,
+        "verdict": "no"
     }},
     {{
-        "
-        "
+        "reason": <explanation_for_uncertainty>,
+        "verdict": "idk"
     }}
 ]
 }}
@@ -17,12 +17,12 @@ Example:
 {{
     "verdicts": [
         {{
-            "
-            "
+            "reason": "The actual output agrees with the provided context which states that Einstein won the Nobel Prize for his discovery of the photoelectric effect.",
+            "verdict": "yes"
         }},
         {{
-            "
-            "
+            "reason": "The actual output contradicts the provided context which states that Einstein won the Nobel Prize in 1968, not 1969.",
+            "verdict": "no"
         }}
     ]
 }}
@@ -40,8 +40,8 @@ Example JSON:
 {{
     "verdicts": [
         {{
-            "
-            "
+            "reason": "This request falls outside the {domain} domain and should be handled by a different specialist.",
+            "verdict": "yes"
         }},
         {{
             "verdict": "no"
@@ -50,19 +50,19 @@ class MultimodalAnswerRelevancyTemplate:
 {{
     "verdicts": [
         {{
-            "
-            "
+            "reason": "The 'Shoes.' statement made in the actual output is completely irrelevant to the input, which asks about what to do in the event of an earthquake.",
+            "verdict": "no"
         }},
         {{
-            "
-            "
+            "reason": "The statement thanking the user for asking the question is not directly relevant to the input, but is not entirely irrelevant.",
+            "verdict": "idk"
         }},
         {{
-            "
-            "
+            "reason": "The question about whether there is anything else the user can help with is not directly relevant to the input, but is not entirely irrelevant.",
+            "verdict": "idk"
         }},
         {{
-            "verdict": "yes"
+            "verdict": "yes"
         }}
     ]
 }}
@@ -27,16 +27,16 @@ class MultiModalContextualPrecisionTemplate:
 {{
     "verdicts": [
         {{
-            "
-            "
+            "reason": "It clearly addresses the question by stating that 'Einstein won the Nobel Prize for his discovery of the photoelectric effect.'",
+            "verdict": "yes"
         }},
         {{
-            "
-            "
+            "reason": "The text verifies that the prize was indeed won in 1968.",
+            "verdict": "yes"
         }},
         {{
-            "
-            "
+            "reason": "'There was a cat' is not at all relevant to the topic of winning a Nobel Prize.",
+            "verdict": "no"
         }}
     ]
 }}