judgeval 0.0.10__py3-none-any.whl → 0.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/common/tracer.py CHANGED
@@ -8,8 +8,20 @@ import functools
  import requests
  import uuid
  from contextlib import contextmanager
- from typing import Optional, Any, List, Literal, Tuple, Generator, TypeAlias, Union
- from dataclasses import dataclass, field
+ from typing import (
+     Optional,
+     Any,
+     List,
+     Literal,
+     Tuple,
+     Generator,
+     TypeAlias,
+     Union
+ )
+ from dataclasses import (
+     dataclass,
+     field
+ )
  from datetime import datetime
  from openai import OpenAI
  from together import Together
@@ -21,18 +33,26 @@ import json
  import warnings
  from pydantic import BaseModel
  from http import HTTPStatus
- from rich import print as rprint

- from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL
+ import pika
+ import os
+
+ from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FETCH_API_URL, RABBITMQ_HOST, RABBITMQ_PORT, RABBITMQ_QUEUE, JUDGMENT_TRACES_DELETE_API_URL
  from judgeval.judgment_client import JudgmentClient
  from judgeval.data import Example
- from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
+ from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
+
+ from rich import print as rprint
+
  from judgeval.data.result import ScoringResult
+ from judgeval.evaluation_run import EvaluationRun

  # Define type aliases for better code readability and maintainability
  ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic] # Supported API clients
  TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation'] # Valid trace entry types
  SpanType = Literal['span', 'tool', 'llm', 'evaluation']
+
+
  @dataclass
  class TraceEntry:
      """Represents a single trace entry with its visual representation.
@@ -54,7 +74,7 @@ class TraceEntry:
      # Use field() for mutable defaults to avoid shared state issues
      inputs: dict = field(default_factory=dict)
      span_type: SpanType = "span"
-     evaluation_result: Optional[List[ScoringResult]] = field(default=None)
+     evaluation_runs: List[Optional[EvaluationRun]] = field(default=None)

      def print_entry(self):
          indent = " " * self.depth
@@ -67,7 +87,8 @@ class TraceEntry:
          elif self.type == "input":
              print(f"{indent}Input: {self.inputs}")
          elif self.type == "evaluation":
-             print(f"{indent}Evaluation: {self.evaluation_result} ({self.duration:.3f}s)")
+             for evaluation_run in self.evaluation_runs:
+                 print(f"{indent}Evaluation: {evaluation_run.model_dump()}")

      def _serialize_inputs(self) -> dict:
          """Helper method to serialize input data safely.
@@ -114,7 +135,7 @@ class TraceEntry:
              "duration": self.duration,
              "output": self._serialize_output(),
              "inputs": self._serialize_inputs(),
-             "evaluation_result": [result.to_dict() for result in self.evaluation_result] if self.evaluation_result else None,
+             "evaluation_runs": [evaluation_run.model_dump() for evaluation_run in self.evaluation_runs] if self.evaluation_runs else [],
              "span_type": self.span_type
          }

@@ -155,6 +176,106 @@ class TraceEntry:
              return self.output
          except (TypeError, OverflowError, ValueError):
              return safe_stringify(self.output, self.function)
+
+
+ class TraceManagerClient:
+     """
+     Client for handling trace endpoints with the Judgment API
+
+
+     Operations include:
+     - Fetching a trace by id
+     - Saving a trace
+     - Deleting a trace
+     """
+     def __init__(self, judgment_api_key: str):
+         self.judgment_api_key = judgment_api_key
+
+     def fetch_trace(self, trace_id: str):
+         """
+         Fetch a trace by its id
+         """
+         response = requests.post(
+             JUDGMENT_TRACES_FETCH_API_URL,
+             json={
+                 "trace_id": trace_id,
+                 "judgment_api_key": self.judgment_api_key,
+             },
+             headers={
+                 "Content-Type": "application/json",
+             }
+         )
+
+         if response.status_code != HTTPStatus.OK:
+             raise ValueError(f"Failed to fetch traces: {response.text}")
+
+         return response.json()
+
+     def save_trace(self, trace_data: dict, empty_save: bool):
+         """
+         Saves a trace to the database
+
+         Args:
+             trace_data: The trace data to save
+             empty_save: Whether to save an empty trace
+             NOTE we save empty traces in order to properly handle async operations; we need something in the DB to associate the async results with
+         """
+         response = requests.post(
+             JUDGMENT_TRACES_SAVE_API_URL,
+             json=trace_data,
+             headers={
+                 "Content-Type": "application/json",
+             }
+         )
+
+         if response.status_code == HTTPStatus.BAD_REQUEST:
+             raise ValueError(f"Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: {response.text}")
+         elif response.status_code != HTTPStatus.OK:
+             raise ValueError(f"Failed to save trace data: {response.text}")
+
+         if not empty_save and "ui_results_url" in response.json():
+             rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
+
+     def delete_trace(self, trace_id: str):
+         """
+         Delete a trace from the database.
+         """
+         response = requests.delete(
+             JUDGMENT_TRACES_DELETE_API_URL,
+             json={
+                 "judgment_api_key": self.judgment_api_key,
+                 "trace_ids": [trace_id],
+             },
+             headers={
+                 "Content-Type": "application/json",
+             }
+         )
+
+         if response.status_code != HTTPStatus.OK:
+             raise ValueError(f"Failed to delete trace: {response.text}")
+
+         return response.json()
+
+     def delete_traces(self, trace_ids: List[str]):
+         """
+         Delete a batch of traces from the database.
+         """
+         response = requests.delete(
+             JUDGMENT_TRACES_DELETE_API_URL,
+             json={
+                 "judgment_api_key": self.judgment_api_key,
+                 "trace_ids": trace_ids,
+             },
+             headers={
+                 "Content-Type": "application/json",
+             }
+         )
+
+         if response.status_code != HTTPStatus.OK:
+             raise ValueError(f"Failed to delete trace: {response.text}")
+
+         return response.json()
+

  class TraceClient:
      """Client for managing a single trace context"""
@@ -169,6 +290,7 @@ class TraceClient:
          self.span_type = None
          self._current_span: Optional[TraceEntry] = None
          self.overwrite = overwrite
+         self.trace_manager_client = TraceManagerClient(tracer.api_key) # Manages DB operations for trace data

      @contextmanager
      def span(self, name: str, span_type: SpanType = "span"):
@@ -185,6 +307,7 @@ class TraceClient:
              span_type=span_type
          ))

+         # Increment nested depth and set current span
          self.tracer.depth += 1
          prev_span = self._current_span
          self._current_span = name
@@ -207,7 +330,7 @@
          ))
          self._current_span = prev_span

-     async def async_evaluate(
+     def async_evaluate(
          self,
          scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
          input: Optional[str] = None,
@@ -233,25 +356,40 @@
              additional_metadata=additional_metadata,
              trace_id=self.trace_id
          )
-         scoring_results = self.client.run_evaluation(
-             examples=[example],
-             scorers=scorers,
-             model=model,
-             metadata={},
+
+         try:
+             # Load appropriate implementations for all scorers
+             loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
+                 scorer.load_implementation(use_judgment=True) if isinstance(scorer, ScorerWrapper) else scorer
+                 for scorer in scorers
+             ]
+         except Exception as e:
+             raise ValueError(f"Failed to load scorers: {str(e)}")
+
+         eval_run = EvaluationRun(
              log_results=log_results,
             project_name=self.project_name,
-             eval_run_name=(
-                 f"{self.name.capitalize()}-"
+             eval_name=f"{self.name.capitalize()}-"
                  f"{self._current_span}-"
-                 f"[{','.join(scorer.load_implementation().score_type.capitalize() for scorer in scorers)}]"
-             ),
+                 f"[{','.join(scorer.load_implementation().score_type.capitalize() for scorer in scorers)}]",
+             examples=[example],
+             scorers=loaded_scorers,
+             model=model,
+             metadata={},
+             judgment_api_key=self.tracer.api_key,
             override=self.overwrite
         )

-         self.record_evaluation(scoring_results, start_time) # Pass start_time to record_evaluation
+         self.add_eval_run(eval_run, start_time) # Pass start_time to record_evaluation

-     def record_evaluation(self, results: List[ScoringResult], start_time: float):
-         """Record evaluation results for the current span"""
+     def add_eval_run(self, eval_run: EvaluationRun, start_time: float):
+         """
+         Add evaluation run data to the trace
+
+         Args:
+             eval_run (EvaluationRun): The evaluation run to add to the trace
+             start_time (float): The start time of the evaluation run
+         """
          if self._current_span:
              duration = time.time() - start_time # Calculate duration from start_time
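
For reference, the scorer-loading step above can be read as a standalone helper. This is an illustrative sketch only (the helper itself is not part of judgeval); it mirrors the list comprehension in async_evaluate, where ScorerWrapper entries are resolved through load_implementation(use_judgment=True) and concrete scorers pass through unchanged:

from typing import List, Union

from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper


def load_scorers(scorers: list) -> List[Union[APIJudgmentScorer, JudgevalScorer]]:
    """Resolve ScorerWrapper entries to concrete scorer implementations."""
    loaded = []
    for scorer in scorers:
        if isinstance(scorer, ScorerWrapper):
            # Same call async_evaluate makes: defer to the Judgment-hosted implementation.
            loaded.append(scorer.load_implementation(use_judgment=True))
        else:
            loaded.append(scorer)
    return loaded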

@@ -261,7 +399,7 @@ class TraceClient:
                  depth=self.tracer.depth,
                  message=f"Evaluation results for {self._current_span}",
                  timestamp=time.time(),
-                 evaluation_result=results,
+                 evaluation_runs=[eval_run],
                  duration=duration,
                  span_type="evaluation"
              ))
@@ -342,7 +480,7 @@ class TraceClient:
                  "timestamp": entry["timestamp"],
                  "inputs": None,
                  "output": None,
-                 "evaluation_result": None,
+                 "evaluation_runs": [],
                  "span_type": entry.get("span_type", "span")
              }
              active_functions.append(function)
@@ -365,8 +503,8 @@
              if entry["type"] == "output" and entry["output"]:
                  current_entry["output"] = entry["output"]

-             if entry["type"] == "evaluation" and entry["evaluation_result"]:
-                 current_entry["evaluation_result"] = entry["evaluation_result"]
+             if entry["type"] == "evaluation" and entry["evaluation_runs"]:
+                 current_entry["evaluation_runs"] = entry["evaluation_runs"]

          # Sort by timestamp
          condensed.sort(key=lambda x: x["timestamp"])
@@ -418,26 +556,30 @@ class TraceClient:
              "empty_save": empty_save,
              "overwrite": overwrite
          }
-
-         # Save trace data by making POST request to API
-         response = requests.post(
-             JUDGMENT_TRACES_SAVE_API_URL,
-             json=trace_data,
-             headers={
-                 "Content-Type": "application/json",
-             }
-         )
-
-         if response.status_code == HTTPStatus.BAD_REQUEST:
-             raise ValueError(f"Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: {response.text}")
-         elif response.status_code != HTTPStatus.OK:
-             raise ValueError(f"Failed to save trace data: {response.text}")

-         if not empty_save and "ui_results_url" in response.json():
-             rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
+         if not empty_save:
+             connection = pika.BlockingConnection(
+                 pika.ConnectionParameters(host=RABBITMQ_HOST, port=RABBITMQ_PORT))
+             channel = connection.channel()
+
+             channel.queue_declare(queue=RABBITMQ_QUEUE, durable=True)
+
+             channel.basic_publish(
+                 exchange='',
+                 routing_key=RABBITMQ_QUEUE,
+                 body=json.dumps(trace_data),
+                 properties=pika.BasicProperties(
+                     delivery_mode=pika.DeliveryMode.Transient # Changed from Persistent to Transient
+                 ))
+             connection.close()

+         self.trace_manager_client.save_trace(trace_data, empty_save)
+
          return self.trace_id, trace_data

+     def delete(self):
+         return self.trace_manager_client.delete_trace(self.trace_id)
+
  class Tracer:
      _instance = None
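
Taken together, TraceClient.save() now hands the trace payload to RabbitMQ (when the save is not an empty one) and then persists it through the new TraceManagerClient. Two hedged sketches follow; neither is part of the package. First, direct use of TraceManagerClient, whose constructor and methods are defined earlier in this file (the API key and trace ids are placeholders):

import os

from judgeval.common.tracer import TraceManagerClient

# Placeholder key read from the environment.
manager = TraceManagerClient(judgment_api_key=os.environ["JUDGMENT_API_KEY"])

trace = manager.fetch_trace(trace_id="11111111-1111-1111-1111-111111111111")  # returns the stored trace JSON
manager.delete_trace(trace_id="11111111-1111-1111-1111-111111111111")
manager.delete_traces(trace_ids=["id-1", "id-2"])  # batch variant

Second, a minimal pika consumer for the queue the tracer publishes to. It assumes a reachable broker at the configured host and port, and that each message body is the json.dumps(trace_data) payload shown in save():

import json

import pika

from judgeval.constants import RABBITMQ_HOST, RABBITMQ_PORT, RABBITMQ_QUEUE


def handle_trace(ch, method, properties, body):
    # The producer publishes json.dumps(trace_data); decode and inspect it here.
    trace_data = json.loads(body)
    print("received trace payload with keys:", sorted(trace_data))


connection = pika.BlockingConnection(
    pika.ConnectionParameters(host=RABBITMQ_HOST, port=RABBITMQ_PORT))
channel = connection.channel()
channel.queue_declare(queue=RABBITMQ_QUEUE, durable=True)  # mirrors the producer's declaration
channel.basic_consume(queue=RABBITMQ_QUEUE, on_message_callback=handle_trace, auto_ack=True)
channel.start_consuming()
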
judgeval/constants.py CHANGED
@@ -32,16 +32,25 @@ class APIScorer(str, Enum):
          return member

  ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
- ## API URLs
+ # API URLs
  JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
  JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
  JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
  JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
  JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
  JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
+ JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
+ JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
+ JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
  JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
+ JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"

- ## Models
+ # RabbitMQ
+ RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
+ RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
+ RABBITMQ_QUEUE = os.getenv("RABBITMQ_QUEUE", "task_queue")
+
+ # Models
  TOGETHER_SUPPORTED_MODELS = {
      "QWEN": "Qwen/Qwen2-72B-Instruct",
      "LLAMA3_70B_INSTRUCT_TURBO": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
@@ -51,7 +60,9 @@ TOGETHER_SUPPORTED_MODELS = {
      "MISTRAL_8x7B_INSTRUCT": "mistralai/Mixtral-8x7B-Instruct-v0.1",
  }

- ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS.keys())
+ JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini"}
+
+ ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS.keys()) | JUDGMENT_SUPPORTED_MODELS

  ## System settings
  MAX_WORKER_THREADS = 10
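
A short, hedged sketch of how these constants behave (not part of the package): they are read once at import time via os.getenv, so overrides must be set before judgeval is first imported, and values supplied through the environment arrive as strings. The Judgment-hosted models now count as acceptable:

import os

# Point the RabbitMQ settings at a local broker (placeholder values); this must run
# before judgeval.constants is imported for the overrides to take effect.
os.environ["RABBITMQ_HOST"] = "localhost"
os.environ["RABBITMQ_PORT"] = "5672"          # note: environment values are strings
os.environ["RABBITMQ_QUEUE"] = "task_queue"

from judgeval.constants import RABBITMQ_HOST, ACCEPTABLE_MODELS

print(RABBITMQ_HOST)                        # "localhost"
print("osiris-large" in ACCEPTABLE_MODELS)  # True, via JUDGMENT_SUPPORTED_MODELS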
@@ -15,7 +15,7 @@ class EvaluationRun(BaseModel):
          project_name (str): The name of the project the evaluation results belong to
          eval_name (str): A name for this evaluation run
          examples (List[Example]): The examples to evaluate
-         scorers (List[Union[JudgmentScorer, CustomScorer]]): A list of scorers to use for evaluation
+         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
          model (str): The model used as a judge when using LLM as a Judge
          aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
          metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
@@ -33,6 +33,7 @@ class EvaluationRun(BaseModel):
      metadata: Optional[Dict[str, Any]] = None
      # API Key will be "" until user calls client.run_eval(), then API Key will be set
      judgment_api_key: Optional[str] = ""
+     override: Optional[bool] = False

      def model_dump(self, **kwargs):
          data = super().model_dump(**kwargs)
judgeval/judges/utils.py CHANGED
@@ -6,7 +6,7 @@ from typing import Optional, Union, Tuple, List

  from judgeval.common.exceptions import InvalidJudgeModelError
  from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
- from judgeval.constants import TOGETHER_SUPPORTED_MODELS
+ from judgeval.constants import TOGETHER_SUPPORTED_MODELS, JUDGMENT_SUPPORTED_MODELS, ACCEPTABLE_MODELS
  LITELLM_SUPPORTED_MODELS = set(litellm.model_list)


@@ -33,7 +33,13 @@ def create_judge(
      # Either string or List[str]
      if isinstance(model, list):
          for m in model:
-             if m not in TOGETHER_SUPPORTED_MODELS and m not in LITELLM_SUPPORTED_MODELS:
+             if m in JUDGMENT_SUPPORTED_MODELS:
+                 raise NotImplementedError(
+                     """Judgment models are not yet supported for local scoring.
+                     Please either set the `use_judgment` flag to True or use
+                     non-Judgment models."""
+                 )
+             if m not in LITELLM_SUPPORTED_MODELS and m not in TOGETHER_SUPPORTED_MODELS:
                  raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
          return MixtureOfJudges(models=model), True
      # If model is a string, check that it corresponds to a valid model
@@ -41,5 +47,11 @@
          return LiteLLMJudge(model=model), True
      if model in TOGETHER_SUPPORTED_MODELS:
          return TogetherJudge(model=model), True
+     if model in JUDGMENT_SUPPORTED_MODELS:
+         raise NotImplementedError(
+             """Judgment models are not yet supported for local scoring.
+             Please either set the `use_judgment` flag to True or use
+             non-Judgment models."""
+         )
      else:
          raise InvalidJudgeModelError(f"Invalid judge model chosen: {model}")
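
A hedged sketch of the new behavior (it assumes create_judge can be called with just a model argument and that the import path matches the file above):

from judgeval.judges.utils import create_judge
from judgeval.common.exceptions import InvalidJudgeModelError

try:
    create_judge(model="osiris-large")  # Judgment-hosted model
except NotImplementedError:
    print("Judgment models are only scored through the Judgment API (use_judgment=True)")

try:
    create_judge(model="definitely-not-a-model")
except InvalidJudgeModelError as err:
    print(err)  # "Invalid judge model chosen: ..."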
@@ -23,7 +23,7 @@ from judgeval.run_evaluation import (
      assert_test
  )
  from judgeval.judges import JudgevalJudge
- from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL
+ from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL, JUDGMENT_EVAL_DELETE_API_URL, JUDGMENT_EVAL_DELETE_PROJECT_API_URL
  from judgeval.common.exceptions import JudgmentAPIError
  from pydantic import BaseModel

@@ -194,6 +194,51 @@ class JudgmentClient:
          eval_run_result[0]["id"] = result_id
          eval_run_result[0]["results"] = [ScoringResult(**filtered_result)]
          return eval_run_result
+
+     def delete_eval(self, project_name: str, eval_run_name: str) -> bool:
+         """
+         Deletes an evaluation from the server by project and run name.
+
+         Args:
+             project_name (str): Name of the project
+             eval_run_name (str): Name of the evaluation run
+
+         Returns:
+             bool: Whether the evaluation was successfully deleted
+         """
+         eval_run_request_body = EvalRunRequestBody(project_name=project_name,
+                                                    eval_name=eval_run_name,
+                                                    judgment_api_key=self.judgment_api_key)
+         response = requests.delete(JUDGMENT_EVAL_DELETE_API_URL,
+                                    json=eval_run_request_body.model_dump(),
+                                    headers={
+                                        "Content-Type": "application/json",
+                                    })
+         if response.status_code != requests.codes.ok:
+             raise ValueError(f"Error deleting eval results: {response.json()}")
+         return response.json()
+
+     def delete_project_evals(self, project_name: str) -> bool:
+         """
+         Deletes all evaluations from the server for a given project.
+
+         Args:
+             project_name (str): Name of the project
+
+         Returns:
+             bool: Whether the evaluations were successfully deleted
+         """
+         response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
+                                    json={
+                                        "project_name": project_name,
+                                        "judgment_api_key": self.judgment_api_key
+                                    },
+                                    headers={
+                                        "Content-Type": "application/json",
+                                    })
+         if response.status_code != requests.codes.ok:
+             raise ValueError(f"Error deleting eval results: {response.json()}")
+         return response.json()
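
A hedged usage sketch for the two new deletion helpers (not part of the diff; how JudgmentClient is constructed is assumed here, an API key as in earlier releases, and the project and run names are placeholders):

from judgeval.judgment_client import JudgmentClient

client = JudgmentClient(judgment_api_key="YOUR_JUDGMENT_API_KEY")  # assumed constructor

# Remove a single evaluation run, identified by project + run name.
client.delete_eval(project_name="demo-project", eval_run_name="demo-run")

# Or remove every evaluation stored under a project.
client.delete_project_evals(project_name="demo-project")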

      def _validate_api_key(self):
          """
@@ -1,5 +1,5 @@
  """
- Custom Scorer class
+ Judgeval Scorer class

  Enables client to create custom scorers that do not fall under any of the ready-made Judgment scorers.
  To create a custom scorer, extend this class and implement the `score_example`, `a_score_example`, and `success_check` methods.
@@ -57,12 +57,12 @@ class JudgevalScorer:
          verbose_logs: Optional[str] = None,
          additional_metadata: Optional[Dict] = None
      ):
-         debug(f"Initializing CustomScorer with score_type={score_type}, threshold={threshold}")
+         debug(f"Initializing JudgevalScorer with score_type={score_type}, threshold={threshold}")
          if not 0 <= threshold <= 1:
              raise ValueError("Threshold must be between 0 and 1")
          if strict_mode:
              warning("Strict mode enabled - scoring will be more rigorous")
-         info(f"CustomScorer initialized with evaluation_model: {evaluation_model}")
+         info(f"JudgevalScorer initialized with evaluation_model: {evaluation_model}")
          self.score_type = score_type
          self.threshold = threshold
          self.score = score
@@ -81,7 +81,7 @@ class JudgevalScorer:

      def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
          """
-         Adds the evaluation model to the CustomScorer instance
+         Adds the evaluation model to the JudgevalScorer instance

          This method is used at eval time
          """
@@ -116,10 +116,10 @@ class JudgevalScorer:
          raise NotImplementedError("You must implement the `passes` method in your custom scorer")

      def __str__(self):
-         debug("Converting CustomScorer instance to string representation")
+         debug("Converting JudgevalScorer instance to string representation")
          if self.error:
-             warning(f"CustomScorer contains error: {self.error}")
-         info(f"CustomScorer status - success: {self.success}, score: {self.score}")
+             warning(f"JudgevalScorer contains error: {self.error}")
+         info(f"JudgevalScorer status - success: {self.success}, score: {self.score}")
          attributes = {
              "score_type": self.score_type,
              "threshold": self.threshold,
@@ -137,4 +137,4 @@ class JudgevalScorer:
              "verbose_logs": self.verbose_logs,
              "additional_metadata": self.additional_metadata,
          }
-         return f"CustomScorer({attributes})"
+         return f"JudgevalScorer({attributes})"
@@ -2,7 +2,7 @@
  Code for the local implementation of the Faithfulness metric.
  """
  from typing import List, Optional, Union
-
+ from pprint import pprint
  from judgeval.constants import APIScorer
  from judgeval.data import (
      Example,
@@ -114,11 +114,13 @@ class FaithfulnessScorer(JudgevalScorer):
          ):
              self.claims = await self._a_generate_claims(example.actual_output)

+
              if self.additional_metadata is None:
                  self.additional_metadata = {}
              self.additional_metadata["claims"] = self.claims

              self.verdicts = await self._a_generate_verdicts(example.retrieval_context)
+
              self.additional_metadata["verdicts"] = [v.model_dump() for v in self.verdicts] # Add verdicts generated to metadata

              self.score = self._calculate_score()
@@ -129,10 +129,13 @@ JSON:
      def create_verdicts(claims, retrieval_context):
          return f"""==== TASK INSTRUCTIONS ====
  You will be provided with a list of claims from an LLM's output text, accompanied by the retrieval documents that the LLM used to generate the output.
- Your task is to determine whether EACH claim is factually consistent with the retrieval context ("yes", "no", or "idk").
- ONLY choose 'no' if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGMENT.
+ I'm pretty sure that many of the claims are factually contradictory to the retrieval context, but I want you to double check that I'm right.
+ For each claim, choose one of ("yes", "no", or "idk") to represent whether the claim is correct based on the retrieval context.
+ YOU SHOULD be very scrutinous--if any part of the claim is contradicted by the retrieval context, you should choose "no". Think really hard about finding the contradictions, since they can be subtle!
+
+ Choose 'no' if the retrieval context CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGMENT.
  Claims made using vague, suggestive, or speculative language such as 'may have', 'possibility due to', do NOT count as a contradiction.
- Claims that are not justified by the retrieval context due to a lack of information MUST BE ANSWERED with 'idk'.
+ Claims that are fuzzy based on lack of information MUST BE ANSWERED with 'idk'.

  ==== FORMATTING YOUR ANSWER ====
  Please return your answer in JSON format, with the 'verdicts' key as a list of JSON objects. Each JSON object should have 2 fields: 'verdict' and 'reason'.
@@ -72,7 +72,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
              strict_mode=strict_mode,
              verbose_mode=verbose_mode,
          )
-         # Then initialize CustomScorer
+         # Then initialize JudgevalScorer
          JudgevalScorer.__init__(
              self,
              score_type=name,
@@ -309,7 +309,7 @@ class ClassifierScorer(PromptScorer):
              strict_mode=strict_mode,
              verbose_mode=verbose_mode,
          )
-         # Then initialize CustomScorer
+         # Then initialize JudgevalScorer
          JudgevalScorer.__init__(
              self,
              score_type=name,
judgeval/scorers/score.py CHANGED
@@ -1,5 +1,5 @@
  """
- Infrastructure for executing evaluations of `Example`s using one or more `CustomScorer`s.
+ Infrastructure for executing evaluations of `Example`s using one or more `JudgevalScorer`s.
  """


@@ -30,15 +30,15 @@ async def safe_a_score_example(
  ):
      """
      Scoring task function when not using a progress indicator!
-     "Safely" scores an `Example` using a `CustomScorer` by gracefully handling any exceptions that may occur.
+     "Safely" scores an `Example` using a `JudgevalScorer` by gracefully handling any exceptions that may occur.

      Args:
-         scorer (CustomScorer): The `CustomScorer` to use for scoring the example.
+         scorer (JudgevalScorer): The `JudgevalScorer` to use for scoring the example.
          example (Example): The `Example` to be scored.

          ignore_errors (bool): Whether to ignore errors during the evaluation.
              If set to false, any error will be raised and stop the evaluation.
-             If set to true, the error will be stored in the `error` attribute of the `CustomScorer` and the `success` attribute will be set to False.
+             If set to true, the error will be stored in the `error` attribute of the `JudgevalScorer` and the `success` attribute will be set to False.

          skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
      """
@@ -102,12 +102,12 @@ async def score_task(
      skip_on_missing_params: bool = True,
  ):
      """
-     Task function for asynchronously measuring a given example using a custom scorer.
+     Task function for asynchronously measuring a given example using a JudgevalScorer.

      Args:
          task_id (int): The ID of the task being measured.
          progress (Progress): An instance of the Progress class to track task progress.
-         scorer (CustomScorer): An instance of the CustomScorer class used to score the example.
+         scorer (JudgevalScorer): An instance of the JudgevalScorer class used to score the example.
          example (Example): The example to be scored.
          ignore_errors (bool, optional): Whether to ignore errors during scoring. Defaults to True.
          skip_on_missing_params (bool, optional): Whether to skip scoring if there are missing parameters. Defaults to True.
@@ -189,10 +189,10 @@ async def score_with_indicator(
      show_indicator: bool,
  ):
      """
-     Scores an example using a list of custom scorers, optionally displaying a progress indicator.
+     Scores an example using a list of JudgevalScorers, optionally displaying a progress indicator.

      Args:
-         scorers (List[CustomScorer]): A list of custom scorer objects to evaluate the example.
+         scorers (List[JudgevalScorer]): A list of JudgevalScorer objects to evaluate the example.
          example (Example): The example to be scored.
          ignore_errors (bool): If True, errors during scoring will be ignored.
          skip_on_missing_params (bool): If True, scoring will be skipped if required parameters are missing.
@@ -253,8 +253,8 @@ async def a_execute_scoring(
      _use_bar_indicator: bool = True,
  ) -> List[ScoringResult]:
      """
-     Executes evaluations of `Example`s asynchronously using one or more `CustomScorer`s.
-     Each `Example` will be evaluated by all of the `CustomScorer`s in the `scorers` list.
+     Executes evaluations of `Example`s asynchronously using one or more `JudgevalScorer`s.
+     Each `Example` will be evaluated by all of the `JudgevalScorer`s in the `scorers` list.

      Args:
          examples (List[Example]): A list of `Example` objects to be evaluated.
@@ -379,7 +379,7 @@ async def a_eval_examples_helper(
      Evaluate a single example asynchronously using a list of scorers.

      Args:
-         scorers (List[CustomScorer]): List of CustomScorer objects to evaluate the example.
+         scorers (List[JudgevalScorer]): List of JudgevalScorer objects to evaluate the example.
          example (Example): The example to be evaluated.
          scoring_results (List[ScoringResult]): List to store the scoring results.
          score_index (int): Index at which the result should be stored in scoring_results.
judgeval/scorers/utils.py CHANGED
@@ -32,7 +32,7 @@ def clone_scorers(scorers: List[JudgevalScorer]) -> List[JudgevalScorer]:
          valid_args = {key: args[key] for key in valid_params if key in args}

          cloned_scorer = scorer_class(**valid_args)
-         # kinda hacky, but in case the class inheriting from CustomScorer doesn't have `model` in its __init__,
+         # kinda hacky, but in case the class inheriting from JudgevalScorer doesn't have `model` in its __init__,
          # we need to explicitly include it here so that we can add the judge model to the cloned scorer
          cloned_scorer._add_model(model=args.get("model"))
          cloned_scorers.append(cloned_scorer)
@@ -91,7 +91,7 @@ def parse_response_json(llm_response: str, scorer: Optional[JudgevalScorer] = No

      Args:
          llm_response (str): The response from an LLM.
-         scorer (CustomScorer, optional): The scorer object to forward errors to (if any).
+         scorer (JudgevalScorer, optional): The scorer object to forward errors to (if any).
      """
      start = llm_response.find("{") # opening bracket
      end = llm_response.rfind("}") + 1 # closing bracket
@@ -129,7 +129,7 @@ def create_verbose_logs(metric: JudgevalScorer, steps: List[str]) -> str:
      Creates verbose logs for a scorer object.

      Args:
-         metric (CustomScorer): The scorer object.
+         metric (JudgevalScorer): The scorer object.
          steps (List[str]): The steps to be included in the verbose logs.

      Returns:
@@ -0,0 +1,3 @@
+ from judgeval.common.tracer import Tracer, wrap, TraceClient, TraceManagerClient
+
+ __all__ = ["Tracer", "wrap", "TraceClient", "TraceManagerClient"]
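
The new judgeval/tracer package is a thin re-export of the tracing primitives, so both import paths resolve to the same objects. A small sketch:

from judgeval.tracer import Tracer, TraceManagerClient
from judgeval.common.tracer import Tracer as CommonTracer

# The package-level names are re-exports, not copies.
assert Tracer is CommonTracer
print(TraceManagerClient.__module__)  # "judgeval.common.tracer"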
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.10
+ Version: 0.0.11
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,12 +12,10 @@ Classifier: Programming Language :: Python :: 3
  Requires-Python: >=3.11
  Requires-Dist: anthropic
  Requires-Dist: fastapi
- Requires-Dist: langfuse==2.50.3
  Requires-Dist: litellm
  Requires-Dist: nest-asyncio
  Requires-Dist: openai
  Requires-Dist: pandas
- Requires-Dist: patronus
  Requires-Dist: pika
  Requires-Dist: python-dotenv==1.0.1
  Requires-Dist: requests
@@ -25,11 +23,14 @@ Requires-Dist: supabase
  Requires-Dist: together
  Requires-Dist: uvicorn
  Provides-Extra: dev
+ Requires-Dist: langfuse==2.50.3; extra == 'dev'
+ Requires-Dist: patronus; extra == 'dev'
  Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
  Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
  Requires-Dist: pytest>=8.3.4; extra == 'dev'
+ Requires-Dist: tavily-python; extra == 'dev'
  Description-Content-Type: text/markdown

  # judgeval

- Judgeval is a open-source evaluation framework for multi-agent LLM workflows, for both real-time and offline evaluations.
+ Judgeval is an open-source evaluation framework for multi-agent LLM workflows, for both real-time and offline evaluations.
@@ -1,13 +1,13 @@
  judgeval/__init__.py,sha256=xiiG4CkeaOtey4fusCd9CBz0BVqzTIbV-K2EFIU0rUM,283
  judgeval/clients.py,sha256=Ns5ljrgPPXUMo7fSPJxO12H64lcPyKeQPIVG_RMi2cM,1162
- judgeval/constants.py,sha256=qwWc3EOpXSn9SHq5rylkHhnzH5WldedqSMCToa7vgZk,2040
- judgeval/evaluation_run.py,sha256=KcIS7mDR_9XEdqYrJXFcrLz5IDMof34HcD5VtjZgV8w,5884
- judgeval/judgment_client.py,sha256=jMeayUI-Z-GX4mVMVC9t5f7ENKLQ8dOepScYu5Yytf0,11777
+ judgeval/constants.py,sha256=oL3kWHg9CzQJiTInDTgJgxRhF3fgylhvEVP360UqG8A,2654
+ judgeval/evaluation_run.py,sha256=ev-IbL34SwRv8lwB4KHfYag1jYo6b049R8mmwNBqmnM,5923
+ judgeval/judgment_client.py,sha256=thmSXi2essIlmd_j5SjlBw9_8qJJp6N3djoWdLaMrj0,13770
  judgeval/run_evaluation.py,sha256=YOQ6s9RuUrXPTgoYexf7r6Hl1QKIMSTdvHl9kw-ZMzw,20103
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
  judgeval/common/logger.py,sha256=QXN3UMymmKu2iMEMEgATLBnMDjGr_pE2iOSEFoICgg8,6092
- judgeval/common/tracer.py,sha256=k5g9ZLeM-fLdV_q9NpodN8gW4nLTIXsbxeTaXVjm9jk,25658
+ judgeval/common/tracer.py,sha256=wp-oGl8rdAe3_UXcvrEKFg7V6Vnvrnz9y_RVVgYOjCY,29934
  judgeval/common/utils.py,sha256=3WRyyX0tvnnj_VAVlEdtZrfzyWj6zfX04xdpCtE1m5Y,33736
  judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
  judgeval/data/api_example.py,sha256=vwWFbI6eJr5VgURCRbuSiMtEXLUbTCih_BcaqEBy-pg,4108
@@ -24,15 +24,15 @@ judgeval/judges/base_judge.py,sha256=qhYSFxE21WajYNaT4X-qwWGtpo_tqzBzdqbszSheSD8
  judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
  judgeval/judges/mixture_of_judges.py,sha256=OuGWCuXyqe7s_Y74ij90TJFRfHU-VAFyJVVrwBM0RO0,15532
  judgeval/judges/together_judge.py,sha256=x3jf-tq77QPXHeeoF739f69hE_0VceXD9FHLrVFdGVA,2275
- judgeval/judges/utils.py,sha256=YUvivcGV1OKLPMJ9N6aTvhA0r_zzJ2NXriPguiiaVaY,2110
+ judgeval/judges/utils.py,sha256=sYxSJq5cI9LtyJaxurcW9IwngALC9Ty8F_Mb8gz81nE,2732
  judgeval/scorers/__init__.py,sha256=XcDdLn_s16rSQob0896oj4JXTA8-Xfl271TUEBj6Oew,998
  judgeval/scorers/api_scorer.py,sha256=88kCWr6IetLFn3ziTPG-lwDWvMhFUC6xfINU1MJBoho,2125
  judgeval/scorers/base_scorer.py,sha256=mbOReG88fWaqCnC8F0u5QepRlzgVkuOz89KEKYxrmMc,1794
  judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
- judgeval/scorers/judgeval_scorer.py,sha256=14SZ3sBZtGNM3BCegKebkNad9LTs5Tyhs0kD6l3wLAA,6275
- judgeval/scorers/prompt_scorer.py,sha256=bUv8eZNy1XGVM1gNMt33dgIVX6zj63bGAV6O0o0c7yg,17821
- judgeval/scorers/score.py,sha256=zJKG21h9Njyj2vS36CAFK2wlbOcHSKgrLgHV5_25KKw,18630
- judgeval/scorers/utils.py,sha256=dtueaJm8e3Ph3wj1vC-srzadgK_CoIlOefdvMQ-cwK8,6826
+ judgeval/scorers/judgeval_scorer.py,sha256=T9fkJwFVYMzW88TFr-RWg-Fqmp-cdrA8bLFymqMzOa8,6291
+ judgeval/scorers/prompt_scorer.py,sha256=UHkOUts1aIQCoYFcr-sKyucmvv_8ONFE5LZO01aObd0,17825
+ judgeval/scorers/score.py,sha256=GALVmeApP1Cyih2vY93zRaU6RShtW4jJDG47Pm6yfnw,18657
+ judgeval/scorers/utils.py,sha256=X7lBI0LRBnBR8KUU-Fvont2Wq31t5p6zOTWGebWIcAU,6832
  judgeval/scorers/judgeval_scorers/__init__.py,sha256=D12jJAKTcfmz8fDBkYeOmdzZMZsURuODIJ5p7Nk1lWE,5189
  judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=zFwH2TC5AFlpDRfVKc6GN4YTtnmeyALl-JRLoZD_Jco,1284
  judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=690G5askjE8dcbKPGvCF6JxAEM9QJUqb-3K-D6lI6oM,463
@@ -65,8 +65,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__i
  judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py,sha256=BtVgE7z-9PHfFRcvn96aEG5mXVcWBweVyty934hZdiU,8915
  judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py,sha256=6EHBfxWvhur9z14l8zCw5Z4Hb2uRo9Yv7qIhTRT7-aM,4591
  judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py,sha256=NbkSqPwxgF4T8KsvuIWhVyRwdOlo7mNHMFuRStTFnvk,154
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py,sha256=4XqdcdgHg3evrg-IQwXmUHEyee1lZUjXRNEiQSvdpmQ,11341
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py,sha256=oxmCsouh5ExUMmlSuCDolpYR2y9c-yKth6PHrdsCH_g,11387
+ judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py,sha256=fSxIn1uRvwCf7u4cOK4XrcPdS7OPzAWL9xt1pxujosY,11368
+ judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py,sha256=vNLjF4NKZJSV4VNenHzoAUB2xVZz6tt_5AzryKmOVrI,11690
  judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py,sha256=fZk3UQxI9Nljf5qjCRLRkF0D-AERFHElI9cC83_cgV8,158
  judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py,sha256=orCrEe1IH4NE7m-AkKMX0EHbysTuAwIqfohcQaU7XxQ,9670
  judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py,sha256=BkEu7Q_jIVdcdZSq37tMjitZFzACd8-iBTDDXfGbZig,4346
@@ -77,7 +77,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
  judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=CBuE6oCxMzTdJoXFt_YPWBte88kedEQ9t3g52ZRztGY,21086
  judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
  judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=CYGRJY5EuyICYzHrmFdLykwXakX8AC7G3Bhj7p6szfY,5493
- judgeval-0.0.10.dist-info/METADATA,sha256=i9jeAPs3jY5hAHAdE_rlen4qJdEk0eAqQ0BOzMie97I,1205
- judgeval-0.0.10.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- judgeval-0.0.10.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
- judgeval-0.0.10.dist-info/RECORD,,
+ judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
+ judgeval-0.0.11.dist-info/METADATA,sha256=WH8aPpUNCwE1Zr21qJ0H0WEVB_i_dilyLSbw9e5nXZo,1283
+ judgeval-0.0.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ judgeval-0.0.11.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+ judgeval-0.0.11.dist-info/RECORD,,