PyPI - judgeval - Versions diffs - 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl - Mend

judgeval 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

judgeval/common/tracer.py +104 -28
judgeval/common/utils.py +12 -13
judgeval/constants.py +61 -10
judgeval/data/datasets/dataset.py +1 -1
judgeval/data/datasets/eval_dataset_client.py +0 -1
judgeval/evaluation_run.py +8 -0
judgeval/judges/together_judge.py +1 -1
judgeval/judges/utils.py +1 -1
judgeval/judgment_client.py +139 -14
judgeval/rules.py +384 -0
judgeval/run_evaluation.py +16 -5
judgeval/scorers/api_scorer.py +11 -12
judgeval/scorers/base_scorer.py +1 -1
judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -1
judgeval/utils/alerts.py +43 -0
{judgeval-0.0.14.dist-info → judgeval-0.0.16.dist-info}/METADATA +1 -1
{judgeval-0.0.14.dist-info → judgeval-0.0.16.dist-info}/RECORD +19 -17
{judgeval-0.0.14.dist-info → judgeval-0.0.16.dist-info}/WHEEL +0 -0
{judgeval-0.0.14.dist-info → judgeval-0.0.16.dist-info}/licenses/LICENSE.md +0 -0

judgeval/run_evaluation.py CHANGED Viewed

@@ -20,6 +20,7 @@ from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
+    MAX_CONCURRENT_EVALUATIONS
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.evaluation_run import EvaluationRun
@@ -30,6 +31,7 @@ from judgeval.common.logger import (
     error,
     example_logging_context
 )
+from judgeval.rules import RulesEngine, Rule, AlertResult, AlertStatus
 def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
@@ -228,6 +230,7 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -245,7 +248,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
             metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
             judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
             log_results (bool): Whether to log the results to the Judgment API
+            rules (Optional[List[Rule]]): Rules to evaluate against scoring results
     Returns:
         List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
@@ -316,7 +319,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
                 metadata=evaluation_run.metadata,
                 judgment_api_key=evaluation_run.judgment_api_key,
                 organization_id=evaluation_run.organization_id,
-                log_results=evaluation_run.log_results
+                log_results=evaluation_run.log_results,
+                rules=evaluation_run.rules
             )
             debug("Sending request to Judgment API")
             response_data: List[Dict] = execute_api_eval(api_evaluation_run)  # Dicts are `ScoringResult` objs
@@ -346,7 +350,6 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
                     ]
                 api_results.append(ScoringResult(**filtered_result))
     # Run local evals
     if local_scorers:  # List[JudgevalScorer]
         info("Starting local evaluation")
@@ -364,12 +367,11 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
                 show_indicator=True,
                 _use_bar_indicator=True,
                 throttle_value=0,
-                max_concurrent=100,
+                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
             )
         )
         local_results = results
         info(f"Local evaluation complete with {len(local_results)} results")
     # Aggregate the ScorerData from the API and local evaluations
     debug("Merging API and local results")
     merged_results: List[ScoringResult] = merge_results(api_results, local_results)
@@ -377,6 +379,15 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
     info(f"Successfully merged {len(merged_results)} results")
+    # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
+    # if evaluation_run.rules and merged_results:
+    #     run_rules(
+    #         local_results=merged_results,
+    #         rules=evaluation_run.rules,
+    #         judgment_api_key=evaluation_run.judgment_api_key,
+    #         organization_id=evaluation_run.organization_id
+    #     )
     if evaluation_run.log_results:
         log_evaluation_results(merged_results, evaluation_run)

judgeval/scorers/api_scorer.py CHANGED Viewed

@@ -34,22 +34,22 @@ class APIJudgmentScorer(BaseModel):
     @field_validator('score_type')
     def convert_to_enum_value(cls, v):
         """
-        Validates that the `score_type` is a valid `JudgmentMetric` enum value.
-        Converts string values to `JudgmentMetric` enum values.
+        Validates that the `score_type` is a valid `APIScorer` enum value.
+        Converts string values to `APIScorer` enum values.
         """
         debug(f"Attempting to convert score_type value: {v}")
         if isinstance(v, APIScorer):
-            info(f"Using existing JudgmentMetric: {v.value}")
-            return v.value
+            info(f"Using existing APIScorer: {v}")
+            return v
         elif isinstance(v, str):
-            debug(f"Converting string value to JudgmentMetric enum: {v}")
-            return APIScorer[v.upper()].value
+            debug(f"Converting string value to APIScorer enum: {v}")
+            return APIScorer[v.upper()]
         error(f"Invalid score_type value: {v}")
         raise ValueError(f"Invalid value for score_type: {v}")
     def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
+        return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
     def to_dict(self) -> dict:
         """
         Converts the scorer configuration to a dictionary format.
@@ -58,7 +58,6 @@ class APIJudgmentScorer(BaseModel):
             dict: A dictionary containing the scorer's configuration
         """
         return {
-            "score_type": self.score_type,
+            "score_type": str(self.score_type.value),  # Convert enum to string for serialization
             "threshold": self.threshold
-        }
+        }

judgeval/scorers/base_scorer.py CHANGED Viewed

@@ -48,5 +48,5 @@ class APIJudgmentScorer(BaseModel):
         raise ValueError(f"Invalid value for score_type: {v}")
     def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
+        return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"

judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py CHANGED Viewed

@@ -46,7 +46,6 @@ class AnswerRelevancyScorer(JudgevalScorer):
         )
         self.model, self.using_native_model = create_judge(model)
         self.evaluation_model = self.model.get_model_name()
-        print(self.model)
     def score_example(
         self,

judgeval/utils/alerts.py ADDED Viewed

@@ -0,0 +1,43 @@
+"""
+Handling alerts in Judgeval.
+"""
+from enum import Enum
+from typing import Dict, Any, List, Optional
+from pydantic import BaseModel
+class AlertStatus(str, Enum):
+    """Status of an alert evaluation."""
+    TRIGGERED = "triggered"
+    NOT_TRIGGERED = "not_triggered"
+class AlertResult(BaseModel):
+    """
+    Result of a rule evaluation.
+    Attributes:
+        rule_name: Name of the rule that was evaluated
+        rule_id: Unique identifier of the rule
+        status: Status of the alert (triggered or not)
+        conditions_result: List of condition evaluation results
+        metadata: Dictionary containing example_id, timestamp, and other metadata
+    """
+    rule_name: str
+    rule_id: Optional[str] = None  # The unique identifier of the rule
+    status: AlertStatus
+    conditions_result: List[Dict[str, Any]] = []
+    metadata: Dict[str, Any] = {}
+    @property
+    def example_id(self) -> Optional[str]:
+        """Get example_id from metadata for backward compatibility"""
+        return self.metadata.get("example_id")
+    @property
+    def timestamp(self) -> Optional[str]:
+        """Get timestamp from metadata for backward compatibility"""
+        return self.metadata.get("timestamp")
+    @property
+    def conditions_results(self) -> List[Dict[str, Any]]:
+        """Backwards compatibility property for the conditions_result field"""
+        return self.conditions_result

{judgeval-0.0.14.dist-info → judgeval-0.0.16.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.14
+Version: 0.0.16
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues

{judgeval-0.0.14.dist-info → judgeval-0.0.16.dist-info}/RECORD RENAMED Viewed

@@ -1,33 +1,34 @@
 judgeval/__init__.py,sha256=xiiG4CkeaOtey4fusCd9CBz0BVqzTIbV-K2EFIU0rUM,283
 judgeval/clients.py,sha256=Ns5ljrgPPXUMo7fSPJxO12H64lcPyKeQPIVG_RMi2cM,1162
-judgeval/constants.py,sha256=43hGesvBbX1uzc4KXvjLCVdd6cyZRMSnEJp11oA7h74,2794
-judgeval/evaluation_run.py,sha256=59lG8AUFTKqbY_JVEEA0I093-Pmiy0ERYDK5BuXuEGg,5965
-judgeval/judgment_client.py,sha256=ryGT3A9-Him6oco3WvuHbjB-FVvAR3wCiiGz03eO_Q4,15409
-judgeval/run_evaluation.py,sha256=Cc7BS07WyqsNpQ38HdMdRI782N3DANjM8UcIq9AwaGA,20769
+judgeval/constants.py,sha256=9ndMvpjXEUPF3RlZZXyTXGDVWE3roH9MH6CZ9h38lrc,4700
+judgeval/evaluation_run.py,sha256=qGNafAPLW9tq6KV3eDolIASodLxcSBWQO0mQ0Aq9Cig,6285
+judgeval/judgment_client.py,sha256=fg45YySB-WbusPw3JmsQEvMIiqBzKcZiXl5I8ewRft0,23252
+judgeval/rules.py,sha256=6RT3DdMvu-LpjqiSDMAd4d2T8S8pOfc_IBEvTgf3GS4,15665
+judgeval/run_evaluation.py,sha256=1BRIaDnu622lzgEgbNzD5T4_hbNfMaJZqiicBbEBGQg,21416
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=QXN3UMymmKu2iMEMEgATLBnMDjGr_pE2iOSEFoICgg8,6092
-judgeval/common/tracer.py,sha256=qam2suh-0_Cu_B7AWg3AMfEo2TisRZVY1SnAfqhiFQo,33211
-judgeval/common/utils.py,sha256=3WRyyX0tvnnj_VAVlEdtZrfzyWj6zfX04xdpCtE1m5Y,33736
+judgeval/common/tracer.py,sha256=pggEJTAH690jg7LPTfVBdGG3IMjkqpbsUBIoSnGtWes,36477
+judgeval/common/utils.py,sha256=T1lpObopcH868NIgOTzNViTB33OGadcVWxWcfh2pm3E,33439
 judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
 judgeval/data/api_example.py,sha256=vwWFbI6eJr5VgURCRbuSiMtEXLUbTCih_BcaqEBy-pg,4108
 judgeval/data/example.py,sha256=Rd-eDEM-giYfkfsGh_PBS2wwl15QlQPzbMV-J64Yj5E,2991
 judgeval/data/result.py,sha256=8FIO-bFKPegZuByKRjA2_sumjb8oGWQ5ZeQ1RVz5z2w,4393
 judgeval/data/scorer_data.py,sha256=pYljblCPZrlMIv5Eg7R-clnmsqzUBAwokKjZpwa0DXE,3280
 judgeval/data/datasets/__init__.py,sha256=eO6ayeM_bTGwIt0eDSlTBIIBvXvIWRWWSfYZrZROPiQ,265
-judgeval/data/datasets/dataset.py,sha256=KdAY0KRUB2jxcGmc1XXXheFFcPsGFOIGY-kTwBNQS_Y,12080
-judgeval/data/datasets/eval_dataset_client.py,sha256=DzxWQIiHlbpg6FpmWY6brcSP_h_rGcztk2A_6tQNFys,11411
+judgeval/data/datasets/dataset.py,sha256=FRl2efBQZEpyK_ZTM7FMQQ7wjmtvcHCMFBq8L7O2Wus,12080
+judgeval/data/datasets/eval_dataset_client.py,sha256=UDzWOYlJqMDEYLyp9IyNBXWrXG2TyZ2L2JJnzly2mgc,11353
 judgeval/data/datasets/ground_truth.py,sha256=OTBs3VZe-Wp0vEXEsq14GPZHYtpWT16bhGQTycIvkKc,2057
 judgeval/data/datasets/utils.py,sha256=lQxyl7mevct7JcDSyIrU_8QOzT-EYPWEvoUiAeOdeek,2502
 judgeval/judges/__init__.py,sha256=tyQ5KY88Kp1Ctfw2IJxnVEpy8DnFCtmy04JdPOpp-As,339
 judgeval/judges/base_judge.py,sha256=qhYSFxE21WajYNaT4X-qwWGtpo_tqzBzdqbszSheSD8,1000
 judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
 judgeval/judges/mixture_of_judges.py,sha256=OuGWCuXyqe7s_Y74ij90TJFRfHU-VAFyJVVrwBM0RO0,15532
-judgeval/judges/together_judge.py,sha256=x3jf-tq77QPXHeeoF739f69hE_0VceXD9FHLrVFdGVA,2275
-judgeval/judges/utils.py,sha256=sYxSJq5cI9LtyJaxurcW9IwngALC9Ty8F_Mb8gz81nE,2732
+judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
+judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
 judgeval/scorers/__init__.py,sha256=XcDdLn_s16rSQob0896oj4JXTA8-Xfl271TUEBj6Oew,998
-judgeval/scorers/api_scorer.py,sha256=88kCWr6IetLFn3ziTPG-lwDWvMhFUC6xfINU1MJBoho,2125
-judgeval/scorers/base_scorer.py,sha256=mbOReG88fWaqCnC8F0u5QepRlzgVkuOz89KEKYxrmMc,1794
+judgeval/scorers/api_scorer.py,sha256=PPpropMg_vFyUZULWqRPhtz_h2-NVydBMNnGtRpGk4E,2135
+judgeval/scorers/base_scorer.py,sha256=lz3QWPQQIbtsA-TWUjXYYRfQ96uCaAzqxt7Dn4TJa4s,1800
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
 judgeval/scorers/judgeval_scorer.py,sha256=T9fkJwFVYMzW88TFr-RWg-Fqmp-cdrA8bLFymqMzOa8,6291
 judgeval/scorers/prompt_scorer.py,sha256=UHkOUts1aIQCoYFcr-sKyucmvv_8ONFE5LZO01aObd0,17825
@@ -53,7 +54,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__ini
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py,sha256=PDThn6SzqxgMXT7BpQs2TEBOsgfD5fi6fnKk31qaCTo,10227
 judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py,sha256=5B_G7PPEsfLq6cwWkKWcLuy2k_5RgoOzsW3wOZLIeMk,6703
 judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py,sha256=r6yae5iaWtlBL_cP8I-1SuhS9dulsy1e7W9Rcz82v6E,169
-judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py,sha256=QG-oxa6-c74VzTuni17RQ9aeT0t1lCuxQXDMznqX8rc,10714
+judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py,sha256=ipER9zyJLq0NqcmxYwfDhavp4rUDYIaDbghR1R0YpaU,10688
 judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py,sha256=GfbKv595s1a0dB1No_kDsap6gfcr6dYRGiXx0PDb89k,6557
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py,sha256=J6tc-T60AVOEaNVuoVU0XIG6dvQri99Q0tnX_Tm-0vc,108
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py,sha256=tRgRyjGpc4Pe3nQ1c-5NeNYFvbulL7YEnoRa9zLp1gc,9649
@@ -78,7 +79,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarizat
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=CYGRJY5EuyICYzHrmFdLykwXakX8AC7G3Bhj7p6szfY,5493
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
-judgeval-0.0.14.dist-info/METADATA,sha256=ZmCAECDNWwzpuES1slYKWcY_U-SMOsjaOdtSoj6wu0I,1283
-judgeval-0.0.14.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-judgeval-0.0.14.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
-judgeval-0.0.14.dist-info/RECORD,,
+judgeval/utils/alerts.py,sha256=RgW5R9Dn3Jtim0OyAYDbNzjoX2s6SA4Mw16GyyaikjI,1424
+judgeval-0.0.16.dist-info/METADATA,sha256=EQHgzBDEctzLCN0SpLC1m53Z8xVlKZvE0sVMPYkK7yc,1283
+judgeval-0.0.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.16.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.16.dist-info/RECORD,,

{judgeval-0.0.14.dist-info → judgeval-0.0.16.dist-info}/WHEEL RENAMED Viewed

File without changes

{judgeval-0.0.14.dist-info → judgeval-0.0.16.dist-info}/licenses/LICENSE.md RENAMED Viewed

File without changes

judgeval 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl

judgeval 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl