deepeval 3.7.2__py3-none-any.whl → 3.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/human_eval/human_eval.py +2 -1
  3. deepeval/cli/test.py +1 -1
  4. deepeval/config/settings.py +102 -13
  5. deepeval/dataset/dataset.py +35 -11
  6. deepeval/dataset/utils.py +2 -0
  7. deepeval/evaluate/configs.py +1 -1
  8. deepeval/evaluate/execute.py +4 -1
  9. deepeval/metrics/answer_relevancy/template.py +4 -4
  10. deepeval/metrics/argument_correctness/template.py +2 -2
  11. deepeval/metrics/bias/template.py +3 -3
  12. deepeval/metrics/contextual_precision/template.py +6 -6
  13. deepeval/metrics/contextual_recall/template.py +2 -2
  14. deepeval/metrics/contextual_relevancy/template.py +3 -3
  15. deepeval/metrics/conversation_completeness/template.py +2 -2
  16. deepeval/metrics/conversational_dag/templates.py +4 -4
  17. deepeval/metrics/conversational_g_eval/template.py +4 -3
  18. deepeval/metrics/dag/templates.py +4 -4
  19. deepeval/metrics/faithfulness/template.py +4 -4
  20. deepeval/metrics/hallucination/template.py +4 -4
  21. deepeval/metrics/misuse/template.py +2 -2
  22. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +7 -7
  23. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +6 -6
  24. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +2 -2
  25. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +3 -3
  26. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +9 -9
  27. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +4 -4
  28. deepeval/metrics/non_advice/template.py +2 -2
  29. deepeval/metrics/pii_leakage/template.py +2 -2
  30. deepeval/metrics/prompt_alignment/template.py +4 -4
  31. deepeval/metrics/role_violation/template.py +2 -2
  32. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  33. deepeval/metrics/toxicity/template.py +4 -4
  34. deepeval/metrics/turn_relevancy/template.py +2 -2
  35. deepeval/metrics/utils.py +3 -0
  36. deepeval/models/__init__.py +2 -0
  37. deepeval/models/embedding_models/azure_embedding_model.py +28 -15
  38. deepeval/models/embedding_models/local_embedding_model.py +23 -10
  39. deepeval/models/embedding_models/ollama_embedding_model.py +8 -6
  40. deepeval/models/embedding_models/openai_embedding_model.py +18 -2
  41. deepeval/models/llms/anthropic_model.py +17 -5
  42. deepeval/models/llms/azure_model.py +30 -18
  43. deepeval/models/llms/deepseek_model.py +22 -12
  44. deepeval/models/llms/gemini_model.py +120 -87
  45. deepeval/models/llms/grok_model.py +23 -16
  46. deepeval/models/llms/kimi_model.py +23 -12
  47. deepeval/models/llms/litellm_model.py +63 -25
  48. deepeval/models/llms/local_model.py +26 -18
  49. deepeval/models/llms/ollama_model.py +17 -7
  50. deepeval/models/llms/openai_model.py +22 -17
  51. deepeval/models/llms/portkey_model.py +132 -0
  52. deepeval/models/mlllms/__init__.py +1 -0
  53. deepeval/models/mlllms/azure_model.py +343 -0
  54. deepeval/models/mlllms/gemini_model.py +102 -73
  55. deepeval/models/mlllms/ollama_model.py +40 -9
  56. deepeval/models/mlllms/openai_model.py +65 -14
  57. deepeval/models/utils.py +48 -3
  58. deepeval/optimization/__init__.py +13 -0
  59. deepeval/optimization/adapters/__init__.py +2 -0
  60. deepeval/optimization/adapters/deepeval_scoring_adapter.py +588 -0
  61. deepeval/optimization/aggregates.py +14 -0
  62. deepeval/optimization/configs.py +34 -0
  63. deepeval/optimization/copro/configs.py +31 -0
  64. deepeval/optimization/copro/loop.py +837 -0
  65. deepeval/optimization/gepa/__init__.py +7 -0
  66. deepeval/optimization/gepa/configs.py +115 -0
  67. deepeval/optimization/gepa/loop.py +677 -0
  68. deepeval/optimization/miprov2/configs.py +134 -0
  69. deepeval/optimization/miprov2/loop.py +785 -0
  70. deepeval/optimization/mutations/__init__.py +0 -0
  71. deepeval/optimization/mutations/prompt_rewriter.py +458 -0
  72. deepeval/optimization/policies/__init__.py +16 -0
  73. deepeval/optimization/policies/selection.py +166 -0
  74. deepeval/optimization/policies/tie_breaker.py +67 -0
  75. deepeval/optimization/prompt_optimizer.py +462 -0
  76. deepeval/optimization/simba/__init__.py +0 -0
  77. deepeval/optimization/simba/configs.py +33 -0
  78. deepeval/optimization/simba/loop.py +983 -0
  79. deepeval/optimization/simba/types.py +15 -0
  80. deepeval/optimization/types.py +361 -0
  81. deepeval/optimization/utils.py +598 -0
  82. deepeval/prompt/prompt.py +10 -5
  83. deepeval/test_run/cache.py +2 -0
  84. deepeval/test_run/test_run.py +6 -1
  85. deepeval/tracing/context.py +3 -0
  86. deepeval/tracing/tracing.py +22 -11
  87. deepeval/utils.py +24 -0
  88. {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/METADATA +1 -1
  89. {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/RECORD +92 -66
  90. {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/entry_points.txt +1 -1
  91. {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/LICENSE.md +0 -0
  92. {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/WHEEL +0 -0
deepeval/optimization/utils.py ADDED
@@ -0,0 +1,598 @@
+ from __future__ import annotations
+ import inspect
+ import random
+ import re
+ from typing import (
+     Any,
+     Callable,
+     List,
+     Optional,
+     Tuple,
+     TYPE_CHECKING,
+     Union,
+     Dict,
+     Set,
+ )
+
+ from deepeval.errors import DeepEvalError
+ from deepeval.metrics.base_metric import BaseMetric, BaseConversationalMetric
+ from deepeval.prompt.prompt import Prompt
+ from deepeval.prompt.api import PromptType, PromptMessage
+ from deepeval.optimization.types import (
+     ModuleId,
+     PromptConfigurationId,
+     PromptConfiguration,
+     OptimizationReport,
+ )
+
+
+ if TYPE_CHECKING:
+     from deepeval.dataset.golden import Golden, ConversationalGolden
+     from deepeval.prompt.api import PromptMessage
+
+
+ def split_goldens(
+     goldens: Union[List[Golden], List[ConversationalGolden]],
+     pareto_size: int,
+     *,
+     random_state: random.Random,
+ ) -> Tuple[
+     Union[List[Golden], List[ConversationalGolden]],
+     Union[List[Golden], List[ConversationalGolden]],
+ ]:
+     """
+     Split `goldens` into two disjoint parts:
+
+     - d_feedback: items not selected for the Pareto validation set
+     - d_pareto: `pareto_size` items for instance-wise Pareto scoring
+
+     The selection is deterministic given `random_state`. Within each split, the
+     original order from `goldens` is preserved.
+
+     Args:
+         goldens: Full list/sequence of examples.
+         pareto_size: Number of items to allocate to the Pareto set, bounded to [0, len(goldens)].
+         random_state: A shared `random.Random` instance that provides the source
+             of randomness. For reproducible runs, pass the same object used by
+             the GEPA loop, constructed from `GEPAConfig.random_seed`.
+
+     Returns:
+         (d_feedback, d_pareto)
+     """
+     if pareto_size < 0:
+         raise ValueError("pareto_size must be >= 0")
+
+     total = len(goldens)
+
+     if total == 0:
+         # nothing to split
+         return [], []
+
+     # With a single example, we cannot form a meaningful feedback set.
+     # Callers like GEPARunner should enforce a minimum of 2 goldens for
+     # optimization.
+     if total == 1:
+         return [], list(goldens)
+
+     # For total >= 2, ensure that we always leave at least one example
+     # for d_feedback. This keeps the splits disjoint while still honoring
+     # pareto_size as a target up to (total - 1).
+     chosen_size = min(pareto_size, total - 1)
+
+     indices = list(range(total))
+     random_state.shuffle(indices)
+
+     pareto_indices = set(indices[:chosen_size])
+
+     d_pareto = [goldens[i] for i in range(total) if i in pareto_indices]
+     d_feedback = [goldens[i] for i in range(total) if i not in pareto_indices]
+
+     return d_feedback, d_pareto
+
+
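For context, a minimal usage sketch (illustrative only, not part of the diff); it assumes `Golden` accepts an `input` field and shows that passing the same seeded `random.Random` reproduces the same split:

import random
from deepeval.dataset.golden import Golden
from deepeval.optimization.utils import split_goldens

goldens = [Golden(input=f"question {i}") for i in range(10)]
rng = random.Random(42)  # same seed -> same split
d_feedback, d_pareto = split_goldens(goldens, pareto_size=3, random_state=rng)
assert len(d_pareto) == 3 and len(d_feedback) == 7  # disjoint; order preserved within each split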
+ ################################
+ # Prompt normalization helpers #
+ ################################
+
+
+ def _slug(text: str) -> str:
+     slug = text.lower()
+     slug = re.sub(r"[^a-z0-9]+", "-", slug)
+     return slug.strip("-")
+
+
+ def generate_module_id(prompt: Prompt, index: int, existing: Set[str]) -> str:
+     """
+     Build a human-readable module id that is stable within a single optimization run.
+     Prefers alias/label; enriches with the model settings provider and name; dedupes; caps to 64 chars.
+     """
+     parts: List[str] = []
+     if prompt.alias:
+         parts.append(str(prompt.alias))
+     if prompt.label:
+         parts.append(str(prompt.label))
+
+     ms = prompt.model_settings
+     if ms is not None:
+         if ms.provider is not None:
+             parts.append(ms.provider.value)
+         if ms.name:
+             parts.append(ms.name)
+
+     base = "-".join(_slug(p) for p in parts if p) or f"module-{index+1}"
+     base = base[:64] or f"module-{index+1}"
+
+     candidate = base
+     suffix = 2
+     while candidate in existing:
+         candidate = f"{base}-{suffix}"
+         candidate = candidate[:64]
+         suffix += 1
+
+     existing.add(candidate)
+     return candidate
+
+
+ def normalize_seed_prompts(
+     seed_prompts: Union[Dict[ModuleId, Prompt], List[Prompt]],
+ ) -> Dict[ModuleId, Prompt]:
+     """
+     Accept either {module_id: Prompt} or List[Prompt].
+     If a list is given, generate human-readable module ids.
+     """
+     if isinstance(seed_prompts, dict):
+         return dict(seed_prompts)  # shallow copy
+
+     mapping: Dict[ModuleId, Prompt] = {}
+     used: Set[str] = set()
+     for i, prompt in enumerate(seed_prompts):
+         module_id = generate_module_id(prompt, i, used)
+         mapping[module_id] = prompt
+     return mapping
+
+
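A short illustrative sketch (not part of the diff) of how a list of seed prompts is normalized into module ids; it assumes `Prompt` accepts `alias` and `text_template` keyword arguments (the latter is relied on by `inflate_prompts_from_report` below):

from deepeval.prompt.prompt import Prompt
from deepeval.optimization.utils import normalize_seed_prompts

seed = [
    Prompt(alias="Summarizer", text_template="Summarize: {text}"),
    Prompt(alias="Summarizer", text_template="Summarize briefly: {text}"),
]
modules = normalize_seed_prompts(seed)
# Duplicate ids are deduped with a numeric suffix,
# e.g. {"summarizer": <Prompt>, "summarizer-2": <Prompt>}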
+ def build_model_callback_kwargs(
+     *,
+     # scoring context
+     golden: Optional[Union["Golden", "ConversationalGolden"]] = None,
+     # rewriter context
+     feedback_text: Optional[str] = None,
+     # shared
+     prompt: Optional[Prompt] = None,
+     prompt_type: Optional[str] = None,
+     prompt_text: Optional[str] = None,
+     prompt_messages: Optional[List["PromptMessage"]] = None,
+ ) -> Dict[str, Any]:
+     """
+     Build a superset of kwargs for GEPA model callbacks.
+
+     All keys are present in the dict so callbacks can declare any subset of:
+
+         hook: str  # injected by (a_)invoke_model_callback
+         prompt: Prompt
+         prompt_type: str
+         prompt_text: str
+         prompt_messages: List[PromptMessage]
+         golden: Golden | ConversationalGolden
+         feedback_text: str
+
+     Non applicable fields are set to None.
+     """
+     return {
+         # scoring context
+         "golden": golden,
+         # rewriter context
+         "feedback_text": feedback_text,
+         # shared
+         "prompt": prompt,
+         "prompt_text": prompt_text,
+         "prompt_messages": prompt_messages,
+     }
+
+
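An illustrative sketch (not part of the diff) of a callback that declares only the kwargs it needs, plus the superset built for it; the callback body is a canned stand-in for a real model call:

from deepeval.optimization.utils import build_model_callback_kwargs

def my_callback(prompt_text: str, golden=None, hook: str = "") -> str:
    # Only the parameters declared here are passed in by the invoke helpers
    # below; `hook` is injected to identify the call site.
    # A real callback would call an LLM here; return a canned string instead.
    return f"[{hook}] echo: {prompt_text}"

kwargs = build_model_callback_kwargs(prompt_text="Answer concisely: {question}")
# kwargs contains every supported key; non-applicable ones are None.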
+ def invoke_model_callback(
+     *,
+     hook: str,
+     model_callback: Callable[
+         ...,
+         Union[
+             str,
+             Dict,
+             Tuple[Union[str, Dict], float],
+         ],
+     ],
+     candidate_kwargs: Dict[str, Any],
+ ) -> Union[
+     str,
+     Dict,
+     Tuple[Union[str, Dict], float],
+ ]:
+     """
+     Call a user provided model_callback in a synchronous context.
+
+     - Filters kwargs to only those the callback accepts.
+     - Injects `hook` if the callback declares it.
+     - Raises if the callback returns an awaitable; callers must use async
+       helpers for async callbacks.
+     """
+     sig = inspect.signature(model_callback)
+     supported = set(sig.parameters.keys())
+
+     filtered = {
+         key: value
+         for key, value in candidate_kwargs.items()
+         if key in supported
+     }
+
+     if "hook" in supported:
+         filtered["hook"] = hook
+
+     result = model_callback(**filtered)
+     if inspect.isawaitable(result):
+         raise DeepEvalError(
+             "model_callback returned an awaitable from a synchronous context. "
+             "Either declare the callback as `async def` and use async GEPA, or call "
+             "`model.generate(...)` instead of `model.a_generate(...)` inside a sync callback."
+         )
+     return result
+
+
+ async def a_invoke_model_callback(
+     *,
+     hook: str,
+     model_callback: Callable[
+         ...,
+         Union[
+             str,
+             Dict,
+             Tuple[Union[str, Dict], float],
+         ],
+     ],
+     candidate_kwargs: Dict[str, Any],
+ ) -> Union[
+     str,
+     Dict,
+     Tuple[Union[str, Dict], float],
+ ]:
+     """
+     Call a user provided model_callback in an async context.
+
+     - Filters kwargs to only those the callback accepts.
+     - Injects `hook` if the callback declares it.
+     - Supports both sync and async callbacks.
+     """
+     sig = inspect.signature(model_callback)
+     supported = set(sig.parameters.keys())
+
+     filtered = {
+         key: value
+         for key, value in candidate_kwargs.items()
+         if key in supported
+     }
+
+     if "hook" in supported:
+         filtered["hook"] = hook
+
+     result = model_callback(**filtered)
+     if inspect.isawaitable(result):
+         return await result
+     return result
+
+
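Continuing the sketch above (still illustrative, not part of the diff); the hook label is an arbitrary string chosen here:

from deepeval.optimization.utils import (
    invoke_model_callback,
    a_invoke_model_callback,
)

result = invoke_model_callback(
    hook="scoring",  # arbitrary label for the call site
    model_callback=my_callback,
    candidate_kwargs=kwargs,
)

# If my_callback were declared `async def`, the async helper awaits it:
# result = await a_invoke_model_callback(
#     hook="scoring", model_callback=my_callback, candidate_kwargs=kwargs
# )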
+ ###########
+ # Reports #
+ ###########
+
+
+ def build_prompt_config_snapshots(
+     prompt_configurations_by_id: Dict[
+         PromptConfigurationId, "PromptConfiguration"
+     ],
+ ) -> Dict[PromptConfigurationId, Dict[str, Any]]:
+     """
+     Build a serializable snapshot of all prompt configurations.
+
+     Shape matches the docs for `prompt_configurations`:
+
+     {
+         "<config_id>": {
+             "parent": "<parent_id or None>",
+             "prompts": {
+                 "<module_id>": {
+                     "type": "TEXT",
+                     "text_template": "...",
+                 }
+                 # or
+                 "<module_id>": {
+                     "type": "LIST",
+                     "messages": [
+                         {"role": "system", "content": "..."},
+                         ...
+                     ],
+                 },
+             },
+         },
+         ...
+     }
+     """
+     snapshots: Dict[PromptConfigurationId, Dict[str, Any]] = {}
+
+     for cfg_id, cfg in prompt_configurations_by_id.items():
+         prompts_snapshot: Dict[str, Any] = {}
+
+         for module_id, prompt in cfg.prompts.items():
+             if prompt.type is PromptType.LIST:
+                 messages = [
+                     {"role": msg.role, "content": (msg.content or "")}
+                     for msg in (prompt.messages_template or [])
+                 ]
+                 prompts_snapshot[module_id] = {
+                     "type": "LIST",
+                     "messages": messages,
+                 }
+             else:
+                 prompts_snapshot[module_id] = {
+                     "type": "TEXT",
+                     "text_template": (prompt.text_template or ""),
+                 }
+
+         snapshots[cfg_id] = {
+             "parent": cfg.parent,
+             "prompts": prompts_snapshot,
+         }
+
+     return snapshots
+
+
+ def inflate_prompts_from_report(
+     report: OptimizationReport,
+ ) -> Dict[str, Dict[str, Prompt]]:
+     """
+     Build a mapping from configuration id -> { module_id -> Prompt }.
+
+     This is a convenience for users who want to work with real Prompt
+     instances instead of raw snapshots.
+
+     Returns:
+         {
+             "<config_id>": {
+                 "<module_id>": Prompt(...),
+                 ...
+             },
+             ...
+         }
+     """
+     inflated: Dict[str, Dict[str, Prompt]] = {}
+
+     for cfg_id, cfg_snapshot in report.prompt_configurations.items():
+         module_prompts: Dict[str, Prompt] = {}
+
+         for module_id, module_snapshot in cfg_snapshot.prompts.items():
+             if module_snapshot.type == "TEXT":
+                 module_prompts[module_id] = Prompt(
+                     text_template=module_snapshot.text_template or ""
+                 )
+             else:  # "LIST"
+                 messages = [
+                     PromptMessage(role=m.role, content=m.content)
+                     for m in module_snapshot.messages or []
+                 ]
+                 module_prompts[module_id] = Prompt(messages_template=messages)
+
+         inflated[cfg_id] = module_prompts
+
+     return inflated
+
+
+ def get_best_prompts_from_report(
+     report: OptimizationReport,
+ ) -> Dict[str, Prompt]:
+     """
+     Convenience wrapper returning the best configuration's module prompts.
+     """
+     all_prompts = inflate_prompts_from_report(report)
+     return all_prompts.get(report.best_id, {})
+
+
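An illustrative sketch (not part of the diff) of reading prompts back out of a finished report; `report` is assumed to be an `OptimizationReport` produced elsewhere (for example by the new prompt_optimizer module listed above):

from deepeval.optimization.utils import (
    inflate_prompts_from_report,
    get_best_prompts_from_report,
)

# report: OptimizationReport produced by an optimization run (assumed available)
best_prompts = get_best_prompts_from_report(report)  # {module_id: Prompt} for report.best_id
all_configs = inflate_prompts_from_report(report)    # {config_id: {module_id: Prompt}}
for module_id, prompt in best_prompts.items():
    print(module_id, prompt.type)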
+ ##############
+ # Validation #
+ ##############
+ def _format_type_names(types: Tuple[type, ...]) -> str:
+     names = [t.__name__ for t in types]
+     if len(names) == 1:
+         return names[0]
+     if len(names) == 2:
+         return f"{names[0]} or {names[1]}"
+     return ", ".join(names[:-1]) + f", or {names[-1]}"
+
+
+ def validate_instance(
+     *,
+     component: str,
+     param_name: str,
+     value: Any,
+     expected_types: Union[type, Tuple[type, ...]],
+     allow_none: bool = False,
+ ) -> Any:
+     """
+     Generic type validator.
+
+     - component: intended to help identify what is being validated,
+       e.g. "PromptOptimizer.__init__", "PromptOptimizer.optimize", etc.
+     - param_name: the name of the parameter being validated.
+     - value: the actual value passed.
+     - expected_types: a type or tuple of types to accept.
+     - allow_none: if True, None is allowed and returned as-is.
+     """
+     if value is None and allow_none:
+         return value
+
+     if not isinstance(expected_types, tuple):
+         expected_types = (expected_types,)
+
+     if not isinstance(value, expected_types):
+         expected_desc = _format_type_names(expected_types)
+         raise DeepEvalError(
+             f"{component} expected `{param_name}` to be an instance of "
+             f"{expected_desc}, but received {type(value).__name__!r} instead."
+         )
+     return value
+
+
+ def validate_sequence_of(
+     *,
+     component: str,
+     param_name: str,
+     value: Any,
+     expected_item_types: Union[type, Tuple[type, ...]],
+     sequence_types: Tuple[type, ...] = (list, tuple),
+     allow_none: bool = False,
+ ) -> Any:
+     """
+     Generic container validator.
+
+     - Ensures `value` is one of `sequence_types` (list or tuple by default).
+     - Ensures each item is an instance of `expected_item_types`.
+
+     Returns the original `value` on success.
+     """
+     if value is None:
+         if allow_none:
+             return value
+         raise DeepEvalError(
+             f"{component} expected `{param_name}` to be a "
+             f"{_format_type_names(sequence_types)} of "
+             f"{_format_type_names(expected_item_types if isinstance(expected_item_types, tuple) else (expected_item_types,))}, "
+             "but received None instead."
+         )
+
+     if not isinstance(sequence_types, tuple):
+         sequence_types = (sequence_types,)
+
+     if not isinstance(value, sequence_types):
+         expected_seq = _format_type_names(sequence_types)
+         raise DeepEvalError(
+             f"{component} expected `{param_name}` to be a {expected_seq}, "
+             f"but received {type(value).__name__!r} instead."
+         )
+
+     if not isinstance(expected_item_types, tuple):
+         expected_item_types = (expected_item_types,)
+
+     for index, item in enumerate(value):
+         if not isinstance(item, expected_item_types):
+             expected_items = _format_type_names(expected_item_types)
+             raise DeepEvalError(
+                 f"{component} expected all elements of `{param_name}` to be "
+                 f"instances of {expected_items}, but element at index {index} "
+                 f"has type {type(item).__name__!r}."
+             )
+
+     return value
+
+
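An illustrative sketch of the two generic validators (not part of the diff; the component and parameter names are arbitrary):

from deepeval.errors import DeepEvalError
from deepeval.optimization.utils import validate_instance, validate_sequence_of

validate_instance(
    component="PromptOptimizer.optimize",
    param_name="pareto_size",
    value=8,
    expected_types=int,
)  # returns 8

try:
    validate_sequence_of(
        component="PromptOptimizer.optimize",
        param_name="goldens",
        value=["not-a-golden"],
        expected_item_types=dict,
    )
except DeepEvalError as e:
    print(e)  # "... element at index 0 has type 'str'."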
+ def validate_callback(
+     *,
+     component: str,
+     model_callback: Optional[
+         Callable[
+             ...,
+             Union[
+                 str,
+                 Dict,
+                 Tuple[Union[str, Dict], float],
+             ],
+         ]
+     ],
+ ) -> Callable[..., Union[str, Dict, Tuple[Union[str, Dict], float]]]:
+     """
+     Ensure that `model_callback` is provided.
+
+     - `model_callback` should be a callable that performs generation and
+       returns the model output.
+
+     Returns `model_callback` unchanged on success.
+     """
+     if model_callback is None:
+         raise DeepEvalError(
+             f"{component} requires a `model_callback`.\n\n"
+             "supply a custom callable via `model_callback=` that performs "
+             "generation and returns the model output."
+         )
+     return model_callback
+
+
+ def validate_metrics(
+     *,
+     component: str,
+     metrics: Union[List[BaseMetric], List[BaseConversationalMetric]],
+ ) -> Union[List[BaseMetric], List[BaseConversationalMetric]]:
+
+     if metrics is None or not len(metrics):
+         raise DeepEvalError(
+             f"{component} requires a `metrics`.\n\n"
+             "supply one or more DeepEval metrics via `metrics=`"
+         )
+
+     validate_sequence_of(
+         component=component,
+         param_name="metrics",
+         value=metrics,
+         expected_item_types=(BaseMetric, BaseConversationalMetric),
+         sequence_types=(list, tuple),
+     )
+     return list(metrics)
+
+
+ def validate_int_in_range(
+     *,
+     component: str,
+     param_name: str,
+     value: int,
+     min_inclusive: Optional[int] = None,
+     max_exclusive: Optional[int] = None,
+ ) -> int:
+     """
+     Validate that an int is within range [min_inclusive, max_exclusive).
+
+     - If `min_inclusive` is not None, value must be >= min_inclusive.
+     - If `max_exclusive` is not None, value must be < max_exclusive.
+
+     Returns the validated int on success.
+     """
+     value = validate_instance(
+         component=component,
+         param_name=param_name,
+         value=value,
+         expected_types=int,
+     )
+
+     # Lower bound check
+     if min_inclusive is not None and value < min_inclusive:
+         if max_exclusive is None:
+             raise DeepEvalError(
+                 f"{component} expected `{param_name}` to be >= {min_inclusive}, "
+                 f"but received {value!r} instead."
+             )
+         max_inclusive = max_exclusive - 1
+         raise DeepEvalError(
+             f"{component} expected `{param_name}` to be between "
+             f"{min_inclusive} and {max_inclusive} (inclusive), "
+             f"but received {value!r} instead."
+         )
+
+     # Upper bound check (half-open, < max_exclusive)
+     if max_exclusive is not None and value >= max_exclusive:
+         if min_inclusive is None:
+             raise DeepEvalError(
+                 f"{component} expected `{param_name}` to be < {max_exclusive}, "
+                 f"but received {value!r} instead."
+             )
+         max_inclusive = max_exclusive - 1
+         raise DeepEvalError(
+             f"{component} expected `{param_name}` to be between "
+             f"{min_inclusive} and {max_inclusive} (inclusive), "
+             f"but received {value!r} instead."
+         )
+
+     return value
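A brief illustrative sketch of the half-open range check (not part of the diff; the component and parameter names are arbitrary):

from deepeval.optimization.utils import validate_int_in_range

validate_int_in_range(
    component="GEPAConfig",
    param_name="pareto_size",
    value=4,
    min_inclusive=1,
    max_exclusive=33,
)  # OK: 1 <= 4 < 33; a value of 33 would raise "between 1 and 32 (inclusive)"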
deepeval/prompt/prompt.py CHANGED
@@ -4,12 +4,9 @@ import json
  import os
 
  from enum import Enum
- from typing import Optional, List, Dict, Type, Literal
+ from typing import Optional, List, Dict, Type, Literal, TYPE_CHECKING
  from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
  from rich.console import Console
- import time
- import json
- import os
  from pydantic import BaseModel, ValidationError
  import asyncio
  import threading
@@ -38,6 +35,9 @@ from deepeval.confident.api import Api, Endpoints, HttpMethods
  from deepeval.constants import HIDDEN_DIR
 
 
+ if TYPE_CHECKING:
+     from deepeval.optimization.types import OptimizationReport
+
  logger = logging.getLogger(__name__)
 
  portalocker = None
@@ -145,6 +145,9 @@ class Prompt:
          elif messages_template:
              self.type = PromptType.LIST
 
+         # updated after optimization runs
+         self.optimization_report: Optional["OptimizationReport"] = None
+
      def __del__(self):
          """Cleanup polling tasks when instance is destroyed"""
          try:
@@ -178,7 +181,7 @@ class Prompt:
              content = f.read()
              try:
                  data = json.loads(content)
-             except (json.JSONDecodeError, TypeError):
+             except (TypeError, json.JSONDecodeError):
                  self.text_template = content
                  return content
 
@@ -364,6 +367,8 @@ class Prompt:
                  f.seek(0)
                  f.truncate()
                  json.dump(cache_data, f, cls=CustomEncoder)
+                 f.flush()
+                 os.fsync(f.fileno())
          except portalocker.exceptions.LockException:
              # If we can't acquire the lock, silently skip caching
              pass
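The new `optimization_report` attribute gives `Prompt` a place to hold results from the optimization package added in this release. A hedged sketch (not part of the diff; the exact API that populates the attribute is not shown here):

from deepeval.prompt.prompt import Prompt
from deepeval.optimization.utils import get_best_prompts_from_report

prompt = Prompt(text_template="Answer the question: {question}")
# ... an optimization run (see deepeval/optimization/prompt_optimizer.py) is
# expected to populate prompt.optimization_report ...
if prompt.optimization_report is not None:
    best = get_best_prompts_from_report(prompt.optimization_report)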
deepeval/test_run/cache.py CHANGED
@@ -90,6 +90,8 @@ class CachedTestRun(BaseModel):
              # Pydantic version below 2.0
              body = self.dict(by_alias=True, exclude_none=True)
          json.dump(body, f, cls=CustomEncoder)
+         f.flush()
+         os.fsync(f.fileno())
          return self
 
      # load from file (this happens initially during a test run)
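The flush/fsync pairs added here, in test_run.py, and in prompt.py follow a standard durability idiom: `flush()` pushes Python's userspace buffer to the OS, and `os.fsync()` asks the OS to commit the bytes to disk before the lock is released. A generic sketch of the pattern (not deepeval-specific):

import json
import os

def write_json_durably(path: str, payload: dict) -> None:
    with open(path, "w") as f:
        json.dump(payload, f)
        f.flush()              # flush Python's buffer to the OS
        os.fsync(f.fileno())   # ask the OS to persist the file contents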
deepeval/test_run/test_run.py CHANGED
@@ -406,9 +406,10 @@
          try:
              body = self.model_dump(by_alias=True, exclude_none=True)
          except AttributeError:
-             # Pydantic version below 2.0
              body = self.dict(by_alias=True, exclude_none=True)
          json.dump(body, f, cls=TestRunEncoder)
+         f.flush()
+         os.fsync(f.fileno())
          return self
 
      @classmethod
@@ -515,6 +516,8 @@ class TestRunManager:
                  )
                  wrapper_data = {save_under_key: test_run_data}
                  json.dump(wrapper_data, file, cls=TestRunEncoder)
+                 file.flush()
+                 os.fsync(file.fileno())
              else:
                  self.test_run.save(file)
          except portalocker.exceptions.LockException:
@@ -527,6 +530,8 @@
                  LATEST_TEST_RUN_FILE_PATH, mode="w"
              ) as file:
                  json.dump({LATEST_TEST_RUN_LINK_KEY: link}, file)
+                 file.flush()
+                 os.fsync(file.fileno())
          except portalocker.exceptions.LockException:
              pass
 
deepeval/tracing/context.py CHANGED
@@ -73,6 +73,7 @@ def update_current_trace(
      tools_called: Optional[List[ToolCall]] = None,
      expected_tools: Optional[List[ToolCall]] = None,
      test_case: Optional[LLMTestCase] = None,
+     confident_api_key: Optional[str] = None,
  ):
      current_trace = current_trace_context.get()
      if not current_trace:
@@ -109,6 +110,8 @@ def update_current_trace(
          current_trace.tools_called = tools_called
      if expected_tools:
          current_trace.expected_tools = expected_tools
+     if confident_api_key:
+         current_trace.confident_api_key = confident_api_key
 
 
  def update_llm_span(
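An illustrative sketch (not part of the diff) of the new `confident_api_key` parameter; the `observe`/`update_current_trace` import path is the usual deepeval tracing entry point and is assumed here:

from deepeval.tracing import observe, update_current_trace

@observe()
def my_llm_app(question: str) -> str:
    answer = "..."  # call your LLM / agent here
    # Added in this diff: route the current trace to a specific Confident AI API key
    update_current_trace(confident_api_key="<your-confident-api-key>")
    return answer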