deepeval 3.5.0__py3-none-any.whl → 3.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/confident/api.py +2 -0
- deepeval/integrations/langchain/__init__.py +2 -3
- deepeval/integrations/langchain/callback.py +126 -280
- deepeval/integrations/langchain/patch.py +24 -13
- deepeval/integrations/langchain/utils.py +203 -1
- deepeval/integrations/pydantic_ai/patcher.py +220 -185
- deepeval/integrations/pydantic_ai/utils.py +86 -0
- deepeval/metrics/__init__.py +1 -1
- deepeval/metrics/answer_relevancy/template.py +13 -38
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +1 -0
- deepeval/metrics/faithfulness/template.py +17 -27
- deepeval/models/embedding_models/local_embedding_model.py +2 -2
- deepeval/prompt/api.py +24 -2
- deepeval/prompt/prompt.py +141 -17
- deepeval/synthesizer/synthesizer.py +17 -9
- deepeval/tracing/api.py +3 -0
- deepeval/tracing/context.py +3 -1
- deepeval/tracing/tracing.py +12 -2
- deepeval/tracing/types.py +3 -0
- deepeval/tracing/utils.py +6 -2
- deepeval/utils.py +12 -0
- {deepeval-3.5.0.dist-info → deepeval-3.5.2.dist-info}/METADATA +1 -1
- {deepeval-3.5.0.dist-info → deepeval-3.5.2.dist-info}/RECORD +27 -26
- {deepeval-3.5.0.dist-info → deepeval-3.5.2.dist-info}/LICENSE.md +0 -0
- {deepeval-3.5.0.dist-info → deepeval-3.5.2.dist-info}/WHEEL +0 -0
- {deepeval-3.5.0.dist-info → deepeval-3.5.2.dist-info}/entry_points.txt +0 -0
deepeval/metrics/faithfulness/template.py CHANGED

@@ -76,42 +76,31 @@ The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states
 Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
 The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context.
 
-
-IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON objects.
-Example retrieval contexts: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. Einstein won the Nobel Prize in 1968. Einstein is a German Scientist."
-Example claims: ["Barack Obama is a caucasian male.", "Zurich is a city in London", "Einstein won the Nobel Prize for the discovery of the photoelectric effect which may have contributed to his fame.", "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect.", "Einstein was a German chef."]
-
-Example:
+Expected JSON format:
 {{
     "verdicts": [
-        {{
-            "verdict": "idk",
-            "reason": "The claim about Barack Obama is although incorrect, it is not directly addressed in the retrieval context, and so poses no contradiction."
-        }},
-        {{
-            "verdict": "idk",
-            "reason": "The claim about Zurich being a city in London is incorrect but does not pose a contradiction to the retrieval context."
-        }},
        {{
            "verdict": "yes"
        }},
        {{
            "verdict": "no",
-            "reason":
+            "reason": <explanation_for_contradiction>
        }},
        {{
-            "verdict": "
-            "reason":
-        }}
+            "verdict": "idk",
+            "reason": <explanation_for_uncertainty>
+        }}
    ]
 }}
-===== END OF EXAMPLE ======
 
-
-
-
-
-
+Generate ONE verdict per claim - length of 'verdicts' MUST equal number of claims.
+No 'reason' needed for 'yes' verdicts.
+Only use 'no' if retrieval context DIRECTLY CONTRADICTS the claim - never use prior knowledge.
+Use 'idk' for claims not backed up by context OR factually incorrect but non-contradictory - do not assume your knowledge.
+Vague/speculative language in claims (e.g. 'may have', 'possibility') does NOT count as contradiction.
+
+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON objects.
 **
 
 Retrieval Contexts:

@@ -128,13 +117,14 @@ JSON:
 return f"""Below is a list of Contradictions. It is a list of strings explaining why the 'actual output' does not align with the information presented in the 'retrieval context'. Contradictions happen in the 'actual output', NOT the 'retrieval context'.
 Given the faithfulness score, which is a 0-1 score indicating how faithful the `actual output` is to the retrieval context (higher the better), CONCISELY summarize the contradictions to justify the score.
 
-
-IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
-Example JSON:
+Expected JSON format:
 {{
     "reason": "The score is <faithfulness_score> because <your_reason>."
 }}
 
+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+
 If there are no contradictions, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
 Your reason MUST use information in `contradiction` in your reason.
 Be sure in your reason, as if you know what the actual output is from the contradictions.
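For context, a minimal sketch of exercising the updated template through FaithfulnessMetric (not part of the diff; it assumes an evaluation model is configured, and the test-case strings are placeholders):

    from deepeval.metrics import FaithfulnessMetric
    from deepeval.test_case import LLMTestCase

    # Placeholder test case; the metric builds its verdict prompt from the template above.
    test_case = LLMTestCase(
        input="Who explained the photoelectric effect?",
        actual_output="Einstein explained the photoelectric effect.",
        retrieval_context=["Einstein won the Nobel Prize for explaining the photoelectric effect."],
    )
    metric = FaithfulnessMetric()
    metric.measure(test_case)
    print(metric.score, metric.reason)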
deepeval/models/embedding_models/local_embedding_model.py CHANGED

@@ -41,7 +41,7 @@ class LocalEmbeddingModel(DeepEvalBaseEmbeddingModel):
 
     async def a_embed_text(self, text: str) -> List[float]:
         embedding_model = self.load_model()
-        response = embedding_model.embeddings.create(
+        response = await embedding_model.embeddings.create(
            model=self.model_name,
            input=[text],
        )

@@ -49,7 +49,7 @@ class LocalEmbeddingModel(DeepEvalBaseEmbeddingModel):
 
     async def a_embed_texts(self, texts: List[str]) -> List[List[float]]:
         embedding_model = self.load_model()
-        response = embedding_model.embeddings.create(
+        response = await embedding_model.embeddings.create(
            model=self.model_name,
            input=texts,
        )
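A minimal usage sketch of the now-awaited async path (not part of the diff; it assumes a local OpenAI-compatible embedding endpoint is already configured for LocalEmbeddingModel):

    import asyncio
    from deepeval.models.embedding_models.local_embedding_model import LocalEmbeddingModel

    async def main():
        model = LocalEmbeddingModel()  # assumes local embedding settings are already configured
        vector = await model.a_embed_text("hello world")
        print(len(vector))

    asyncio.run(main())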
deepeval/prompt/api.py CHANGED

@@ -1,4 +1,4 @@
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, AliasChoices
 from enum import Enum
 from typing import List, Optional
 

@@ -20,8 +20,30 @@ class PromptType(Enum):
     LIST = "LIST"
 
 
+class PromptVersion(BaseModel):
+    id: str
+    version: str
+    commit_message: str = Field(
+        serialization_alias="commitMessage",
+        validation_alias=AliasChoices("commit_message", "commitMessage"),
+    )
+
+
+class PromptVersionsHttpResponse(BaseModel):
+    text_versions: Optional[List[PromptVersion]] = Field(
+        None,
+        serialization_alias="textVersions",
+        validation_alias=AliasChoices("text_versions", "textVersions"),
+    )
+    messages_versions: Optional[List[PromptVersion]] = Field(
+        None,
+        serialization_alias="messagesVersions",
+        validation_alias=AliasChoices("messages_versions", "messagesVersions"),
+    )
+
+
 class PromptHttpResponse(BaseModel):
-
+    id: str
     text: Optional[str] = None
     messages: Optional[List[PromptMessage]] = None
     interpolation_type: PromptInterpolationType = Field(
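A small sketch (not part of the diff) of how the AliasChoices validation aliases let these models accept either snake_case or camelCase payloads; the field values are placeholders:

    from deepeval.prompt.api import PromptVersion, PromptVersionsHttpResponse

    payload = {
        "textVersions": [
            {"id": "abc123", "version": "00.00.01", "commitMessage": "initial commit"}
        ]
    }
    parsed = PromptVersionsHttpResponse(**payload)  # camelCase keys validate...
    same = PromptVersionsHttpResponse(
        text_versions=[
            PromptVersion(id="abc123", version="00.00.01", commit_message="initial commit")
        ]
    )  # ...and so do snake_case ones
    print(parsed.text_versions[0].version, same.text_versions[0].version)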
deepeval/prompt/prompt.py CHANGED

@@ -1,11 +1,12 @@
 from enum import Enum
-from typing import Optional, List
+from typing import Optional, List, Dict
 from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
 from rich.console import Console
 import time
 import json
 import os
 from pydantic import BaseModel
+import asyncio
 
 from deepeval.prompt.api import (
     PromptHttpResponse,

@@ -13,11 +14,15 @@ from deepeval.prompt.api import (
     PromptType,
     PromptInterpolationType,
     PromptPushRequest,
+    PromptVersionsHttpResponse,
 )
 from deepeval.prompt.utils import interpolate_text
 from deepeval.confident.api import Api, Endpoints, HttpMethods
-
 from deepeval.constants import HIDDEN_DIR
+from deepeval.utils import (
+    get_or_create_event_loop,
+    get_or_create_general_event_loop,
+)
 
 CACHE_FILE_NAME = f"{HIDDEN_DIR}/.deepeval-prompt-cache.json"
 

@@ -63,7 +68,23 @@
         self.alias = alias
         self._text_template = template
         self._messages_template = messages_template
-        self.
+        self._version = None
+        self._polling_tasks: Dict[str, asyncio.Task] = {}
+        self._refresh_map: Dict[str, int] = {}
+
+    @property
+    def version(self):
+        if self._version is not None and self._version != "latest":
+            return self._version
+        versions = self._get_versions()
+        if len(versions) == 0:
+            return "latest"
+        else:
+            return versions[-1].version
+
+    @version.setter
+    def version(self, value):
+        self._version = value
 
     def interpolate(self, **kwargs):
         if self._type == PromptType.TEXT:

@@ -94,6 +115,20 @@
         else:
             raise ValueError(f"Unsupported prompt type: {self._type}")
 
+    def _get_versions(self) -> List:
+        if self.alias is None:
+            raise ValueError(
+                "Prompt alias is not set. Please set an alias to continue."
+            )
+        api = Api()
+        data, _ = api.send_request(
+            method=HttpMethods.GET,
+            endpoint=Endpoints.PROMPTS_VERSIONS_ENDPOINT,
+            url_params={"alias": self.alias},
+        )
+        versions = PromptVersionsHttpResponse(**data)
+        return versions.text_versions or versions.messages_versions or []
+
     def _read_from_cache(
         self, alias: str, version: Optional[str] = None
     ) -> Optional[CachedPrompt]:

@@ -123,8 +158,16 @@
         except Exception as e:
             raise Exception(f"Error reading Prompt cache from disk: {e}")
 
-    def _write_to_cache(
-
+    def _write_to_cache(
+        self,
+        version: Optional[str] = None,
+        text_template: Optional[str] = None,
+        messages_template: Optional[List[PromptMessage]] = None,
+        prompt_version_id: Optional[str] = None,
+        type: Optional[PromptType] = None,
+        interpolation_type: Optional[PromptInterpolationType] = None,
+    ):
+        if not self.alias or not version:
             return
 
         cache_data = {}

@@ -140,14 +183,14 @@
         cache_data[self.alias] = {}
 
         # Cache the prompt
-        cache_data[self.alias][
+        cache_data[self.alias][version] = {
             "alias": self.alias,
-            "version":
-            "template":
-            "messages_template":
-            "prompt_version_id":
-            "type":
-            "interpolation_type":
+            "version": version,
+            "template": text_template,
+            "messages_template": messages_template,
+            "prompt_version_id": prompt_version_id,
+            "type": type,
+            "interpolation_type": interpolation_type,
         }
 
         # Ensure directory exists

@@ -163,12 +206,23 @@
         fallback_to_cache: bool = True,
         write_to_cache: bool = True,
         default_to_cache: bool = True,
+        refresh: Optional[int] = 60,
     ):
+        if refresh:
+            default_to_cache = True
+            write_to_cache = False
         if self.alias is None:
             raise TypeError(
                 "Unable to pull prompt from Confident AI when no alias is provided."
             )
 
+        # Manage background prompt polling
+        loop = get_or_create_general_event_loop()
+        if loop.is_running():
+            loop.create_task(self.create_polling_task(version, refresh))
+        else:
+            loop.run_until_complete(self.create_polling_task(version, refresh))
+
         if default_to_cache:
             try:
                 cached_prompt = self._read_from_cache(self.alias, version)

@@ -200,11 +254,14 @@
            try:
                data, _ = api.send_request(
                    method=HttpMethods.GET,
-                    endpoint=Endpoints.
-
+                    endpoint=Endpoints.PROMPTS_VERSION_ID_ENDPOINT,
+                    url_params={
+                        "alias": self.alias,
+                        "versionId": version or "latest",
+                    },
                )
                response = PromptHttpResponse(
-
+                    id=data["id"],
                    text=data.get("text", None),
                    messages=data.get("messages", None),
                    type=data["type"],

@@ -243,7 +300,7 @@
                self.version = version or "latest"
                self._text_template = response.text
                self._messages_template = response.messages
-                self._prompt_version_id = response.
+                self._prompt_version_id = response.id
                self._type = response.type
                self._interpolation_type = response.interpolation_type
 

@@ -254,7 +311,14 @@
                    description=f"{progress.tasks[task_id].description}[rgb(25,227,160)]Done! ({time_taken}s)",
                )
                if write_to_cache:
-                    self._write_to_cache(
+                    self._write_to_cache(
+                        version=version or "latest",
+                        text_template=response.text,
+                        messages_template=response.messages,
+                        prompt_version_id=response.id,
+                        type=response.type,
+                        interpolation_type=response.interpolation_type,
+                    )
 
     def push(
         self,

@@ -300,3 +364,63 @@
             "✅ Prompt successfully pushed to Confident AI! View at "
             f"[link={link}]{link}[/link]"
         )
+
+    ############################################
+    ### Polling
+    ############################################
+
+    async def create_polling_task(
+        self,
+        version: Optional[str],
+        refresh: Optional[int] = 60,
+    ):
+        if version is None:
+            return
+
+        # If polling task doesn't exist, start it
+        polling_task: Optional[asyncio.Task] = self._polling_tasks.get(version)
+        if refresh:
+            self._refresh_map[version] = refresh
+            if not polling_task:
+                self._polling_tasks[version] = asyncio.create_task(
+                    self.poll(version)
+                )
+
+        # If invalid `refresh`, stop the task
+        else:
+            if polling_task:
+                polling_task.cancel()
+                self._polling_tasks.pop(version)
+                self._refresh_map.pop(version)
+
+    async def poll(self, version: Optional[str] = None):
+        api = Api()
+        while True:
+            try:
+                data, _ = api.send_request(
+                    method=HttpMethods.GET,
+                    endpoint=Endpoints.PROMPTS_VERSION_ID_ENDPOINT,
+                    url_params={
+                        "alias": self.alias,
+                        "versionId": version or "latest",
+                    },
+                )
+                response = PromptHttpResponse(
+                    id=data["id"],
+                    text=data.get("text", None),
+                    messages=data.get("messages", None),
+                    type=data["type"],
+                    interpolation_type=data["interpolationType"],
+                )
+                self._write_to_cache(
+                    version=version or "latest",
+                    text_template=response.text,
+                    messages_template=response.messages,
+                    prompt_version_id=response.id,
+                    type=response.type,
+                    interpolation_type=response.interpolation_type,
+                )
+            except Exception as e:
+                pass
+
+            await asyncio.sleep(self._refresh_map[version])
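Taken together, the prompt changes add version listing, a version property, and background polling that re-pulls and re-caches a pinned version every `refresh` seconds. A usage sketch (not part of the diff; the alias, version string, and template variable are placeholders, and a Confident AI API key is assumed):

    from deepeval.prompt import Prompt

    prompt = Prompt(alias="my-prompt-alias")
    # refresh=60 keeps a background task re-fetching this version roughly every 60 seconds;
    # per the diff it also forces default_to_cache=True and write_to_cache=False.
    prompt.pull(version="00.00.03", refresh=60)
    print(prompt.version)
    text = prompt.interpolate(user_input="Hello!")  # placeholder template variable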
deepeval/synthesizer/synthesizer.py CHANGED

@@ -361,7 +361,7 @@ class Synthesizer:
            progress if _progress is None else nullcontext()
        ):
 
-            for
+            for context_index, context in enumerate(contexts):
                # Calculate pbar lengths
                should_style = (
                    self.styling_config.input_format

@@ -381,7 +381,7 @@
                # Add pbars
                pbar_generate_goldens_id = add_pbar(
                    progress,
-                    f"\t⚡ Generating goldens from context #{
+                    f"\t⚡ Generating goldens from context #{context_index}",
                    total=1 + max_goldens_per_context,
                )
                pbar_generate_inputs_id = add_pbar(

@@ -421,7 +421,9 @@
                    progress, pbar_generate_goldens_id, remove=False
                )
 
-                for
+                for input_index, data in enumerate(
+                    qualified_synthetic_inputs
+                ):
                    # Evolve input
                    evolved_input, evolutions_used = self._evolve_input(
                        input=data.input,

@@ -429,7 +431,9 @@
                        num_evolutions=self.evolution_config.num_evolutions,
                        evolutions=self.evolution_config.evolutions,
                        progress=progress,
-                        pbar_evolve_input_id=pbar_evolve_input_ids[
+                        pbar_evolve_input_id=pbar_evolve_input_ids[
+                            input_index
+                        ],
                        remove_pbar=False,
                    )
 

@@ -441,7 +445,9 @@
                        task=self.styling_config.task,
                    )
                    update_pbar(
-                        progress,
+                        progress,
+                        pbar_evolve_input_ids[input_index],
+                        remove=False,
                    )
                    res: SyntheticData = self._generate_schema(
                        prompt,

@@ -455,15 +461,15 @@
                        input=evolved_input,
                        context=context,
                        source_file=(
-                            source_files[
+                            source_files[context_index]
                            if source_files is not None
                            else None
                        ),
                        additional_metadata={
                            "evolutions": evolutions_used,
-                            "synthetic_input_quality": scores[
+                            "synthetic_input_quality": scores[input_index],
                            "context_quality": (
-                                _context_scores[
+                                _context_scores[context_index]
                                if _context_scores is not None
                                else None
                            ),

@@ -480,7 +486,9 @@
                        res = self._generate(prompt)
                        golden.expected_output = res
                        update_pbar(
-                            progress,
+                            progress,
+                            pbar_evolve_input_ids[input_index],
+                            remove=False,
                        )
 
                    goldens.append(golden)
deepeval/tracing/api.py CHANGED

@@ -86,6 +86,9 @@ class BaseApiSpan(BaseModel):
     cost_per_output_token: Optional[float] = Field(
         None, alias="costPerOutputToken"
     )
+    token_intervals: Optional[Dict[str, str]] = Field(
+        None, alias="tokenIntervals"
+    )
 
     ## evals
     metric_collection: Optional[str] = Field(None, alias="metricCollection")
deepeval/tracing/context.py CHANGED

@@ -4,7 +4,6 @@ from contextvars import ContextVar
 from deepeval.tracing.types import BaseSpan, Trace
 from deepeval.test_case.llm_test_case import ToolCall, LLMTestCase
 from deepeval.tracing.types import LlmSpan, RetrieverSpan
-from deepeval.metrics import BaseMetric
 from deepeval.prompt.prompt import Prompt
 
 current_span_context: ContextVar[Optional[BaseSpan]] = ContextVar(

@@ -117,6 +116,7 @@ def update_llm_span(
     output_token_count: Optional[float] = None,
     cost_per_input_token: Optional[float] = None,
     cost_per_output_token: Optional[float] = None,
+    token_intervals: Optional[Dict[float, str]] = None,
     prompt: Optional[Prompt] = None,
 ):
     current_span = current_span_context.get()

@@ -132,6 +132,8 @@
     if cost_per_input_token:
         current_span.cost_per_input_token = cost_per_input_token
     if cost_per_output_token:
         current_span.cost_per_output_token = cost_per_output_token
+    if token_intervals:
+        current_span.token_intervals = token_intervals
     if prompt:
         current_span.prompt = prompt
 
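A sketch (not part of the diff) of recording per-token timing on the current LLM span. It assumes the call runs inside an active LLM span created by deepeval's tracing and that the remaining update_llm_span parameters keep their defaults; keys are time.perf_counter() floats that tracing.py later converts to ISO timestamps:

    import time
    from deepeval.tracing.context import update_llm_span

    def collect_stream(chunks):
        token_intervals = {}
        pieces = []
        for chunk in chunks:
            token_intervals[time.perf_counter()] = chunk  # perf_counter float -> token text
            pieces.append(chunk)
        # Only takes effect when called inside an active LLM span.
        update_llm_span(token_intervals=token_intervals)
        return "".join(pieces)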
deepeval/tracing/tracing.py CHANGED

@@ -114,7 +114,7 @@ class TraceManager:
            self._print_trace_status(
                message=f"WARNING: Exiting with {queue_size + in_flight} abaonded trace(s).",
                trace_worker_status=TraceWorkerStatus.WARNING,
-                description=f"Set {CONFIDENT_TRACE_FLUSH}=
+                description=f"Set {CONFIDENT_TRACE_FLUSH}=1 as an environment variable to flush remaining traces to Confident AI.",
            )
 
     def mask(self, data: Any):

@@ -314,7 +314,7 @@
                env_text,
                message + ":",
                description,
-                f"\nTo disable dev logging, set {CONFIDENT_TRACE_VERBOSE}=
+                f"\nTo disable dev logging, set {CONFIDENT_TRACE_VERBOSE}=0 as an environment variable.",
            )
        else:
            console.print(message_prefix, env_text, message)

@@ -717,6 +717,16 @@
            api_span.input_token_count = span.input_token_count
            api_span.output_token_count = span.output_token_count
 
+            processed_token_intervals = {}
+            if span.token_intervals:
+                for key, value in span.token_intervals.items():
+                    time = to_zod_compatible_iso(
+                        perf_counter_to_datetime(key),
+                        microsecond_precision=True,
+                    )
+                    processed_token_intervals[time] = value
+                api_span.token_intervals = processed_token_intervals
+
        return api_span
 
 
deepeval/tracing/types.py CHANGED

@@ -102,6 +102,9 @@ class LlmSpan(BaseSpan):
     cost_per_output_token: Optional[float] = Field(
         None, serialization_alias="costPerOutputToken"
     )
+    token_intervals: Optional[Dict[float, str]] = Field(
+        None, serialization_alias="tokenTimes"
+    )
 
     # for serializing `prompt`
     model_config = {"arbitrary_types_allowed": True}
deepeval/tracing/utils.py CHANGED

@@ -100,10 +100,14 @@ def make_json_serializable(obj):
     return _serialize(obj)
 
 
-def to_zod_compatible_iso(
+def to_zod_compatible_iso(
+    dt: datetime, microsecond_precision: bool = False
+) -> str:
     return (
         dt.astimezone(timezone.utc)
-        .isoformat(
+        .isoformat(
+            timespec="microseconds" if microsecond_precision else "milliseconds"
+        )
         .replace("+00:00", "Z")
     )
 
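A quick sketch of the widened timestamp precision (the datetime value is arbitrary):

    from datetime import datetime, timezone
    from deepeval.tracing.utils import to_zod_compatible_iso

    dt = datetime(2024, 1, 2, 3, 4, 5, 123456, tzinfo=timezone.utc)
    print(to_zod_compatible_iso(dt))                              # 2024-01-02T03:04:05.123Z
    print(to_zod_compatible_iso(dt, microsecond_precision=True))  # 2024-01-02T03:04:05.123456Z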
deepeval/utils.py CHANGED

@@ -148,6 +148,18 @@ def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
     return loop
 
 
+def get_or_create_general_event_loop() -> asyncio.AbstractEventLoop:
+    try:
+        loop = asyncio.get_event_loop()
+        if loop.is_closed():
+            raise RuntimeError
+        return loop
+    except RuntimeError:
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        return loop
+
+
 def set_should_skip_on_missing_params(yes: bool):
     s = get_settings()
     with s.edit(persist=False):
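A sketch (not part of the diff) of the new helper, which reuses the current event loop when it is open and otherwise creates and installs a fresh one; this is what Prompt.pull relies on to schedule its polling task:

    import asyncio
    from deepeval.utils import get_or_create_general_event_loop

    async def demo():
        await asyncio.sleep(0)
        return "ok"

    loop = get_or_create_general_event_loop()
    if not loop.is_running():
        print(loop.run_until_complete(demo()))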