PyPI - judgeval - Versions diffs - 0.0.6__tar.gz → 0.0.8__tar.gz - Mend

judgeval 0.0.6__tar.gz → 0.0.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (145) hide show

{judgeval-0.0.6 → judgeval-0.0.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.6
+Version: 0.0.8
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -14,9 +14,11 @@ Requires-Dist: anthropic
 Requires-Dist: fastapi
 Requires-Dist: langfuse==2.50.3
 Requires-Dist: litellm
+Requires-Dist: nest-asyncio
 Requires-Dist: openai
 Requires-Dist: pandas
 Requires-Dist: patronus
+Requires-Dist: pika
 Requires-Dist: python-dotenv==1.0.1
 Requires-Dist: requests
 Requires-Dist: supabase

judgeval-0.0.8/docs/evaluation/unit_testing.mdx ADDED Viewed

@@ -0,0 +1,37 @@
+---
+title: Unit Testing
+---
+CI pipelines are the core of all mature software engineering practices.
+**With LLMs, developers should expect nothing less.**
+Using `judgeval`, you can easily unit test your LLM applications for consistency and quality in any metric of your choice.
+Unit testing is natively supported in `judgeval` through the `client.assert_test` method.
+**This also integrates with `pytest`, meaning you won't have to learn any new testing frameworks!**
+```python
+from judgeval import JudgmentClient
+from judgeval.data import Example
+from judgeval.scorers import FaithfulnessScorer
+def test_faithfulness():
+    client = JudgmentClient()
+    example = Example(
+        input="What is the capital of France?",
+        actual_output="The capital of France is Lyon.",
+        retrieval_context=["Come tour Paris' museums in the capital of France!"],
+    )
+    with pytest.raises(AssertionError):
+        client.assert_test(
+            eval_run_name="test_eval",
+            examples=[example],
+            scorers=[FaithfulnessScorer(threshold=1.0)],
+        )
+```
+`judgeval` naturally integrates into your CI pipelines, allowing you to execute robust unit tests across your entire codebase.
+**This allows you to catch regressions in your LLM applications before they make it to production!**

{judgeval-0.0.6 → judgeval-0.0.8}/docs/introduction.mdx RENAMED Viewed

@@ -82,8 +82,8 @@ Judgeval is designed for AI teams to easily benchmark and iterate on their LLM a
 - Construct powerful custom evaluation pipelines for your LLM systems.
 - Monitor LLM systems in production using state-of-the-art **real-time evaluation foundation models**.
-Judgeval integrates natively with the **Judgment Labs Platform**, allowing you to evaluate, regression test,
-and monitor LLM applications in the cloud.
+Judgeval integrates natively with the **Judgment Labs Platform**, allowing you to [evaluate](/evaluation/introduction), regression test,
+and [monitor](/monitoring/introduction) LLM applications in the cloud.
 Judgeval was built by a passionate team of LLM researchers from **Stanford, Datadog, and Together AI**.

{judgeval-0.0.6 → judgeval-0.0.8}/docs/mint.json RENAMED Viewed

@@ -70,7 +70,8 @@
             "evaluation/scorers/classifier_scorer"
           ]
         },
-        "evaluation/judges"
+        "evaluation/judges",
+        "evaluation/unit_testing"
       ]
     },
     {

{judgeval-0.0.6 → judgeval-0.0.8}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.0.6"
+version = "0.0.8"
 authors = [
     { name="Andrew Li", email="andrew@judgmentlabs.ai" },
     { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -28,6 +28,8 @@ dependencies = [
     "together",
     "anthropic",
     "patronus",
+    "nest-asyncio",
+    "pika",
 ]
 [project.optional-dependencies]

judgeval-0.0.8/src/demo/cookbooks/ci_testing/ci_testing.py ADDED Viewed

@@ -0,0 +1,201 @@
+"""
+Cookbook for CI testing LLM applications using `judgeval`
+Includes unit tests and end-to-end tests for an OpenAI API-based travel agent
+"""
+import asyncio
+import os
+import pytest
+from demo.cookbooks.openai_travel_agent.agent import *
+from judgeval import JudgmentClient
+from judgeval.data import Example
+from judgeval.scorers import (
+    AnswerCorrectnessScorer,
+    AnswerRelevancyScorer,
+    FaithfulnessScorer
+)
+@pytest.fixture
+def judgment_client():
+    return JudgmentClient()
+@pytest.fixture
+def research_data():
+    return {
+        "attractions": [
+            "The iconic Eiffel Tower stands at 324 meters tall and welcomes over 7 million visitors annually. Visitors can access three levels, with the top floor offering panoramic views of Paris. The tower features two restaurants: 58 Tour Eiffel and the Michelin-starred Le Jules Verne.",
+            "The Louvre Museum houses over 380,000 objects and displays 35,000 works of art across eight departments. Home to the Mona Lisa and Venus de Milo, it's the world's largest art museum with 72,735 square meters of exhibition space. Visitors typically need 3-4 hours to see the highlights.",
+            "The historic district of Montmartre sits on Paris's highest hill at 130 meters. Famous for the white Sacré-Cœur Basilica and Place du Tertre filled with artists, it was once home to renowned painters like Picasso and Van Gogh. The area retains its village-like charm with winding cobblestone streets and authentic Parisian cafes."
+        ],
+        "hotels": [
+            "Hotel de la Paix is a luxurious 5-star establishment in the 16th arrondissement, featuring 85 rooms and suites decorated in classic Parisian style. The hotel offers a Michelin-starred restaurant, spa facilities, and is located just 10 minutes from the Arc de Triomphe.",
+            "Hotel de Paris, situated in the Opera district, combines Belle Époque architecture with modern amenities. Recently renovated in 2022, it offers 107 rooms, a rooftop bar with Eiffel Tower views, and has received the Palace distinction for exceptional service.",
+            "Hotel de Ville, a boutique hotel in Le Marais, occupies a restored 17th-century mansion. With 40 individually designed rooms, a courtyard garden, and acclaimed restaurant, it provides an authentic Parisian experience steps from Notre-Dame Cathedral."
+        ],
+        "flights": [
+            "Multiple daily direct flights to Paris Charles de Gaulle (CDG) from major US cities. Air France and United Airlines operate regular routes from JFK, LAX, and Chicago O'Hare. Flight times range from 7-11 hours depending on departure city.",
+            "From San Francisco International Airport (SFO), Air France operates a daily direct flight AF085 departing at 3:30 PM, arriving at CDG at 11:15 AM next day. United Airlines also offers UA990 with a similar schedule. Average flight time is 10 hours 45 minutes."
+        ],
+        "weather": "Paris in mid-February typically experiences cool winter conditions with average daytime temperatures ranging from 8-12°C (46-54°F). Current forecast shows mostly sunny conditions with occasional cloud cover. Morning temperatures around 6°C (43°F) rising to 12°C (54°F) by afternoon. Light breeze of 8-12 km/h expected with 20% chance of precipitation. Evening temperatures dropping to 4°C (39°F). UV index moderate at 3.",
+        "vector_db_results": []
+    }
+@pytest.fixture
+def sample_itinerary() -> str:
+    """
+    Loads the sample itinerary from the saved file
+    """
+    PATH_TO_ITINERARY = os.path.join(os.path.dirname(__file__), "travel_response.txt")
+    with open(PATH_TO_ITINERARY, 'r') as file:
+        return file.read()
+@pytest.fixture
+def expected_itinerary():
+    return """5-Day Paris Itinerary (February 11-15, 2025)
+Accommodation: Hotel de Paris in the Opera district
+- Selected for its central location, rooftop bar with Eiffel Tower views, and recent 2022 renovation
+Transportation:
+- Arrival via Air France flight AF085/United Airlines UA990 from SFO, landing at CDG at 11:15 AM
+Weather Considerations:
+- Pack warm clothing for temperatures between 4-12°C (39-54°F)
+- Morning activities planned indoors due to cooler temperatures
+- Outdoor activities scheduled during peak afternoon warmth
+Day 1 (Feb 11):
+- 11:15 AM: Arrival at CDG, transfer to Hotel de Paris
+- 2:00 PM: Hotel check-in and refresh
+- 3:30 PM: Visit the Eiffel Tower (taking advantage of afternoon warmth)
+- 7:00 PM: Dinner at Le Jules Verne in the Eiffel Tower
+Day 2 (Feb 12):
+- 9:00 AM: Breakfast at hotel
+- 10:00 AM: Louvre Museum visit (3-4 hours, indoor activity during cool morning)
+- 2:30 PM: Late lunch in Opera district
+- 4:00 PM: Rooftop bar at hotel for sunset views
+- Evening: Dinner at hotel's restaurant
+Day 3 (Feb 13):
+- 10:00 AM: Visit Montmartre (during warming temperatures)
+- 11:00 AM: Explore Sacré-Cœur Basilica
+- 12:30 PM: Lunch at local cafe in Montmartre
+- 2:00 PM: Artist square at Place du Tertre
+- Evening: Dinner at authentic Parisian bistro
+Day 4 (Feb 14):
+- Morning: Arc de Triomphe visit (10-minute walk from hotel)
+- Afternoon: Shopping and exploring Opera district
+- Evening: Valentine's Day dinner at hotel's Michelin-starred restaurant
+Day 5 (Feb 15):
+- Morning: Leisurely breakfast
+- Late morning: Check-out and departure
+Note: Indoor alternatives planned in case of precipitation (20% chance). Schedule optimized around temperature peaks of 12°C in afternoons."""
+def test_websearch_tool_answer_relevancy(judgment_client):
+    query = "What is the weather like in San Francisco on February 11th, 2025?"
+    results = search_tavily(query)
+    example = Example(
+        input=query,
+        actual_output=str(results)
+    )
+    scorer = AnswerRelevancyScorer(threshold=0.8)
+    judgment_client.assert_test(
+        examples=[example],
+        scorers=[scorer],
+        model="gpt-4o-mini",
+        project_name="travel_agent_tests",
+        eval_run_name="websearch_relevancy_test",
+        override=True
+    )
+def test_travel_planning_faithfulness(judgment_client, sample_itinerary, research_data):
+    destination = "Paris, France"
+    start_date = "February 11th, 2025"
+    end_date = "February 15th, 2025"
+    hotels_example = Example(
+        input=f"Create a structured travel itinerary for a trip to {destination} from {start_date} to {end_date}.",
+        actual_output=sample_itinerary,
+        retrieval_context=research_data["hotels"]
+    )
+    flights_example = Example(
+        input=f"Create a structured travel itinerary for a trip to {destination} from {start_date} to {end_date}.",
+        actual_output=sample_itinerary,
+        retrieval_context=research_data["flights"]
+    )
+    judgment_client.assert_test(
+        examples=[hotels_example, flights_example],
+        scorers=[FaithfulnessScorer(threshold=1.0)],
+        model="gpt-4o",
+        project_name="travel_agent_tests",
+        eval_run_name="travel_planning_faithfulness_test",
+        override=True
+    )
+def test_travel_planning_answer_correctness(judgment_client, sample_itinerary, expected_itinerary):
+    destination = "Paris, France"
+    start_date = "February 11th, 2025"
+    end_date = "February 15th, 2025"
+    example = Example(
+        input=f"Create a structured travel itinerary for a trip to {destination} from {start_date} to {end_date}.",
+        actual_output=sample_itinerary,
+        expected_output=expected_itinerary
+    )
+    with pytest.raises(AssertionError):
+        judgment_client.assert_test(
+            examples=[example],
+            scorers=[AnswerCorrectnessScorer(threshold=0.75)],
+            model="gpt-4o",
+            project_name="travel_agent_tests",
+            eval_run_name="travel_planning_correctness_test",
+            override=True
+        )
+def save_travel_response(destination, start_date, end_date, research_data, file_path):
+    response = asyncio.run(create_travel_plan(destination, start_date, end_date, research_data))
+    with open(file_path, 'w') as f:
+        f.write(response)
+if __name__ == "__main__":
+    sample_research_data = {
+        "attractions": [
+            "The iconic Eiffel Tower stands at 324 meters tall and welcomes over 7 million visitors annually. Visitors can access three levels, with the top floor offering panoramic views of Paris. The tower features two restaurants: 58 Tour Eiffel and the Michelin-starred Le Jules Verne.",
+            "The Louvre Museum houses over 380,000 objects and displays 35,000 works of art across eight departments. Home to the Mona Lisa and Venus de Milo, it's the world's largest art museum with 72,735 square meters of exhibition space. Visitors typically need 3-4 hours to see the highlights.",
+            "The historic district of Montmartre sits on Paris's highest hill at 130 meters. Famous for the white Sacré-Cœur Basilica and Place du Tertre filled with artists, it was once home to renowned painters like Picasso and Van Gogh. The area retains its village-like charm with winding cobblestone streets and authentic Parisian cafes."
+        ],
+        "hotels": [
+            "Hotel de la Paix is a luxurious 5-star establishment in the 16th arrondissement, featuring 85 rooms and suites decorated in classic Parisian style. The hotel offers a Michelin-starred restaurant, spa facilities, and is located just 10 minutes from the Arc de Triomphe.",
+            "Hotel de Paris, situated in the Opera district, combines Belle Époque architecture with modern amenities. Recently renovated in 2022, it offers 107 rooms, a rooftop bar with Eiffel Tower views, and has received the Palace distinction for exceptional service.",
+            "Hotel de Ville, a boutique hotel in Le Marais, occupies a restored 17th-century mansion. With 40 individually designed rooms, a courtyard garden, and acclaimed restaurant, it provides an authentic Parisian experience steps from Notre-Dame Cathedral."
+        ],
+        "flights": [
+            "Multiple daily direct flights to Paris Charles de Gaulle (CDG) from major US cities. Air France and United Airlines operate regular routes from JFK, LAX, and Chicago O'Hare. Flight times range from 7-11 hours depending on departure city.",
+            "From San Francisco International Airport (SFO), Air France operates a daily direct flight AF085 departing at 3:30 PM, arriving at CDG at 11:15 AM next day. United Airlines also offers UA990 with a similar schedule. Average flight time is 10 hours 45 minutes."
+        ],
+        "weather": "Paris in mid-February typically experiences cool winter conditions with average daytime temperatures ranging from 8-12°C (46-54°F). Current forecast shows mostly sunny conditions with occasional cloud cover. Morning temperatures around 6°C (43°F) rising to 12°C (54°F) by afternoon. Light breeze of 8-12 km/h expected with 20% chance of precipitation. Evening temperatures dropping to 4°C (39°F). UV index moderate at 3.",
+        "vector_db_results": []
+    }
+    save_travel_response("Paris, France", "February 11th, 2025", "February 15th, 2025", sample_research_data, "./travel_response.txt")

judgeval-0.0.8/src/demo/cookbooks/ci_testing/travel_response.txt ADDED Viewed

@@ -0,0 +1,52 @@
+Travel Itinerary: Paris, France (February 11th, 2025 - February 15th, 2025)
+Day 1 - February 11th, 2025:
+Morning:
+- As per your departure location, fly out on a direct flight on Air France or United Airlines to Paris Charles de Gaulle (CDG).
+Afternoon:
+- Arrival at CDG at approximately 11:15 AM. Take a taxi or a private car service to Hotel de Ville in Le Marais where you'll be staying. Check-in and freshen up.
+Evening:
+- Explore the Le Marais district, discover its historic buildings, avant-garde fashion boutiques, and vibrant food scene.
+Day 2 - February 12th, 2025:
+Morning & Afternoon:
+- Visit the iconic Louvre Museum. We recommend arriving early to beat the crowds and spend 3-4 hours viewing the classic works of art, especially the Mona Lisa and Venus de Milo.
+Evening:
+- Follow the Seine river and take a leisurely evening walk from the Louvre to Notre-Dame Cathedral, a masterpiece of French Gothic architecture.
+Day 3 - February 13th, 2025:
+Morning:
+- Dedicate this day to explore the historical district of Montmartre. Start by visiting the Sacré-Cœur Basilica, and enjoy the magnificent views of Paris.
+Afternoon:
+- Visit the famous square Place du Tertre, known for its artists, and have lunch at one of the authentic Parisian cafes surrounding the square.
+Evening:
+- Explore the Montmartre district, with its cobblestone streets and boutiques. End the day with dinner at a local bistro.
+Day 4 - February 14th, 2025:
+Morning:
+- Take a tour to one of the world's most famous landmarks, the Eiffel Tower. Get to the top floor to enjoy a breathtaking view of Paris.
+Afternoon:
+- Have lunch at the Michelin-starred Le Jules Verne, situated on the Eiffel Tower itself.
+Evening:
+- Head back to your hotel and freshen up. Go out to Hotel de Paris' rooftop for drinks and enjoy the Eiffel Tower views.
+Day 5 - February 15th, 2025:
+Morning:
+ - Breakfast and check-out. Visit any local attractions or shopping streets that you might be interested in if time allows.
+Afternoon:
+- Departure: Take a taxi or a private car to Paris Charles de Gaulle (CDG) Airport in time for your flight home.
+Note: Pack for cool winter conditions in mid-February with average daytime temperatures ranging from 8-12°C. It's always recommended to check the forecast closer to your departure date.

{judgeval-0.0.6 → judgeval-0.0.8}/src/demo/cookbooks/openai_travel_agent/agent.py RENAMED Viewed

@@ -1,12 +1,10 @@
 import openai
-import requests
 import os
 import asyncio
 from tavily import TavilyClient
 from dotenv import load_dotenv
 import chromadb
 from chromadb.utils import embedding_functions
-import json
 from judgeval.common.tracer import Tracer, wrap
 from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer

{judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/judgment_client.py RENAMED Viewed

@@ -267,7 +267,6 @@ class JudgmentClient:
         return response.json()["slug"]
     def assert_test(
         self,
         examples: List[Example],
@@ -275,12 +274,14 @@ class JudgmentClient:
         model: Union[str, List[str], JudgevalJudge],
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
-        log_results: bool = False,
-        project_name: str = "",
-        eval_run_name: str = "",
+        log_results: bool = True,
+        project_name: str = "default_project",
+        eval_run_name: str = "default_eval_run",
         override: bool = False,
     ) -> None:
+        """
+        Asserts a test by running the evaluation and checking the results for success
+        """
         results = self.run_evaluation(
             examples=examples,
             scorers=scorers,

{judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorer.py RENAMED Viewed

@@ -58,6 +58,8 @@ class JudgevalScorer:
         additional_metadata: Optional[Dict] = None
         ):
             debug(f"Initializing CustomScorer with score_type={score_type}, threshold={threshold}")
+            if not 0 <= threshold <= 1:
+                raise ValueError("Threshold must be between 0 and 1")
             if strict_mode:
                 warning("Strict mode enabled - scoring will be more rigorous")
             info(f"CustomScorer initialized with evaluation_model: {evaluation_model}")

{judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py RENAMED Viewed

@@ -1,6 +1,7 @@
 from typing import Optional, List, Union, Tuple
 from pydantic import BaseModel
+from judgeval.constants import APIScorer
 from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge
 from judgeval.data import Example, ExampleParams
@@ -38,13 +39,17 @@ class AnswerCorrectnessScorer(JudgevalScorer):
         strict_mode: bool = False,
         verbose_mode: bool = False
     ):
-        self.threshold = 1 if strict_mode else threshold
-        self.include_reason = include_reason
+        super().__init__(
+            score_type=APIScorer.ANSWER_CORRECTNESS,
+            threshold=1 if strict_mode else threshold,
+            evaluation_model=None,
+            include_reason=include_reason,
+            async_mode=async_mode,
+            strict_mode=strict_mode,
+            verbose_mode=verbose_mode
+        )
         self.model, self.using_native_model = create_judge(model)
         self.evaluation_model = self.model.get_model_name()
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
     async def _a_get_statements(self, expected_output: str) -> List[str]:
         prompt = AnswerCorrectnessTemplate.deduce_statements(

{judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py RENAMED Viewed

@@ -1,5 +1,6 @@
 from typing import Optional, List, Union, Tuple
+from judgeval.constants import APIScorer
 from judgeval.scorers.utils import (get_or_create_event_loop,
                                     scorer_progress_meter,
                                     create_verbose_logs,
@@ -34,13 +35,18 @@ class AnswerRelevancyScorer(JudgevalScorer):
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
-        self.threshold = 1 if strict_mode else threshold
+        super().__init__(
+            score_type=APIScorer.ANSWER_RELEVANCY,
+            threshold=1 if strict_mode else threshold,
+            evaluation_model=None,
+            include_reason=include_reason,
+            async_mode=async_mode,
+            strict_mode=strict_mode,
+            verbose_mode=verbose_mode
+        )
         self.model, self.using_native_model = create_judge(model)
         self.evaluation_model = self.model.get_model_name()
-        self.include_reason = include_reason
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
+        print(self.model)
     def score_example(
         self,

{judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py RENAMED Viewed

@@ -4,6 +4,7 @@ from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge
 from judgeval.data import Example, ExampleParams
 from judgeval.scorers import JudgevalScorer
+from judgeval.constants import APIScorer
 from judgeval.scorers.utils import (
     get_or_create_event_loop,
     parse_response_json,
@@ -30,13 +31,17 @@ class ContextualPrecisionScorer(JudgevalScorer):
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
-        self.threshold = 1 if strict_mode else threshold
-        self.include_reason = include_reason
+        super().__init__(
+            score_type=APIScorer.CONTEXTUAL_PRECISION,
+            threshold=1 if strict_mode else threshold,
+            evaluation_model=None,
+            include_reason=include_reason,
+            async_mode=async_mode,
+            strict_mode=strict_mode,
+            verbose_mode=verbose_mode
+        )
         self.model, self.using_native_model = create_judge(model)
         self.evaluation_model = self.model.get_model_name()
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
     def score_example(
         self,

{judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py RENAMED Viewed

@@ -1,5 +1,6 @@
 from typing import Optional, List, Union
+from judgeval.constants import APIScorer
 from judgeval.scorers.utils import (
     get_or_create_event_loop,
     parse_response_json,
@@ -32,14 +33,18 @@ class ContextualRecallScorer(JudgevalScorer):
         verbose_mode: bool = False,
         user: Optional[str] = None
     ):
+        super().__init__(
+            score_type=APIScorer.CONTEXTUAL_RECALL,
+            threshold=1 if strict_mode else threshold,
+            evaluation_model=None,
+            include_reason=include_reason,
+            async_mode=async_mode,
+            strict_mode=strict_mode,
+            verbose_mode=verbose_mode
+        )
         self.user = user
-        self.threshold = 1 if strict_mode else threshold
         self.model, self.using_native_model = create_judge(model)
         self.evaluation_model = self.model.get_model_name()
-        self.include_reason = include_reason
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
     def score_example(
         self,

{judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py RENAMED Viewed

@@ -1,6 +1,7 @@
 from typing import Optional, List, Union
 import asyncio
+from judgeval.constants import APIScorer
 from judgeval.scorers.utils import (get_or_create_event_loop,
                                     scorer_progress_meter,
                                     create_verbose_logs,
@@ -32,14 +33,18 @@ class ContextualRelevancyScorer(JudgevalScorer):
         verbose_mode: bool = False,
         user: Optional[str] = None
     ):
+        super().__init__(
+            score_type=APIScorer.CONTEXTUAL_RELEVANCY,
+            threshold=1 if strict_mode else threshold,
+            evaluation_model=None,
+            include_reason=include_reason,
+            async_mode=async_mode,
+            strict_mode=strict_mode,
+            verbose_mode=verbose_mode
+        )
         self.user = user
-        self.threshold = 1 if strict_mode else threshold
         self.model, self.using_native_model = create_judge(model)
         self.evaluation_model = self.model.get_model_name()
-        self.include_reason = include_reason
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
     def score_example(
         self,

{judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py RENAMED Viewed

@@ -3,7 +3,7 @@ Code for the local implementation of the Faithfulness metric.
 """
 from typing import List, Optional, Union
+from judgeval.constants import APIScorer
 from judgeval.data import (
     Example,
     ExampleParams
@@ -47,14 +47,19 @@ class FaithfulnessScorer(JudgevalScorer):
         verbose_mode: bool = False,
         user: Optional[str] = None
     ):
-        self.threshold = 1 if strict_mode else threshold
+        super().__init__(
+            score_type=APIScorer.FAITHFULNESS,
+            threshold=1 if strict_mode else threshold,
+            evaluation_model=None,
+            include_reason=include_reason,
+            async_mode=async_mode,
+            strict_mode=strict_mode,
+            verbose_mode=verbose_mode
+        )
+        self.user = user
         self.model, self.using_native_model = create_judge(model)
         self.using_native_model = True  # NOTE: SETTING THIS FOR LITELLM and TOGETHER usage
         self.evaluation_model = self.model.get_model_name()
-        self.include_reason = include_reason
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
     def score_example(
         self,

{judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py RENAMED Viewed

@@ -20,6 +20,7 @@ Hallucination is measuring the fraction of contexts that agree with output (do n
 from typing import Optional, Union, List
+from judgeval.constants import APIScorer
 from judgeval.scorers.utils import (get_or_create_event_loop,
                                     scorer_progress_meter,
                                     create_verbose_logs,
@@ -50,13 +51,17 @@ class HallucinationScorer(JudgevalScorer):
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
-        self.threshold = 1 if strict_mode else threshold
+        super().__init__(
+            score_type=APIScorer.HALLUCINATION,
+            threshold=1 if strict_mode else threshold,
+            evaluation_model=None,
+            include_reason=include_reason,
+            async_mode=async_mode,
+            strict_mode=strict_mode,
+            verbose_mode=verbose_mode
+        )
         self.model, self.using_native_model = create_judge(model)
         self.evaluation_model = self.model.get_model_name()
-        self.include_reason = include_reason
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
     def score_example(
         self,

judgeval-0.0.8/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from judgeval.scorers.judgeval_scorers.local_implementations.json_correctness.json_correctness_scorer import JsonCorrectnessScorer
+__all__ = [
+    "JsonCorrectnessScorer",
+]

judgeval 0.0.6__tar.gz → 0.0.8__tar.gz

judgeval 0.0.6__tar.gz → 0.0.8__tar.gz