judgeval 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
- judgeval/__init__.py +5 -5
- judgeval/api/api_types.py +81 -12
- judgeval/cli.py +2 -1
- judgeval/constants.py +0 -6
- judgeval/data/evaluation_run.py +7 -8
- judgeval/data/judgment_types.py +97 -12
- judgeval/data/trace.py +108 -1
- judgeval/dataset/__init__.py +72 -23
- judgeval/env.py +5 -20
- judgeval/integrations/langgraph/__init__.py +9 -785
- judgeval/scorers/__init__.py +6 -0
- judgeval/scorers/api_scorer.py +15 -12
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +26 -35
- judgeval/scorers/score.py +1 -1
- judgeval/scorers/utils.py +1 -4
- judgeval/tracer/__init__.py +181 -162
- judgeval/tracer/exporters/__init__.py +4 -1
- judgeval/tracer/keys.py +15 -25
- judgeval/tracer/llm/__init__.py +0 -1
- judgeval/tracer/llm/anthropic/__init__.py +20 -0
- judgeval/tracer/llm/google/__init__.py +21 -0
- judgeval/tracer/llm/groq/__init__.py +20 -0
- judgeval/tracer/llm/openai/__init__.py +32 -0
- judgeval/tracer/llm/providers.py +28 -79
- judgeval/tracer/llm/together/__init__.py +20 -0
- judgeval/tracer/managers.py +23 -48
- judgeval/tracer/processors/__init__.py +36 -75
- judgeval/tracer/utils.py +3 -4
- judgeval/trainer/trainer.py +4 -4
- judgeval/utils/file_utils.py +0 -2
- judgeval/utils/meta.py +18 -5
- judgeval/utils/testing.py +0 -14
- judgeval/utils/version_check.py +2 -0
- judgeval/version.py +1 -1
- {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/METADATA +1 -7
- {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/RECORD +43 -38
- {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/WHEEL +0 -0
- {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/__init__.py
CHANGED
```diff
@@ -1,7 +1,10 @@
 from judgeval.scorers.api_scorer import (
     APIScorerConfig,
+    ExampleAPIScorerConfig,
+    TraceAPIScorerConfig,
 )
 from judgeval.scorers.base_scorer import BaseScorer
+from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.scorers.judgeval_scorers.api_scorers import (
     FaithfulnessScorer,
     AnswerRelevancyScorer,
@@ -13,7 +16,10 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
 
 __all__ = [
     "APIScorerConfig",
+    "ExampleAPIScorerConfig",
+    "TraceAPIScorerConfig",
     "BaseScorer",
+    "ExampleScorer",
     "TracePromptScorer",
     "PromptScorer",
     "FaithfulnessScorer",
```
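For downstream code, the practical effect is that the two new config base classes and `ExampleScorer` are now importable from `judgeval.scorers`. A minimal sketch of the new surface (the constructor arguments are illustrative; only the exported names are confirmed by this diff):

```python
# New exports in judgeval.scorers as of 0.13.0 (see the diff above).
from judgeval.scorers import (
    ExampleAPIScorerConfig,  # base for example-level API scorers
    TraceAPIScorerConfig,    # base for trace-level API scorers
    ExampleScorer,           # re-exported custom-scorer base
    FaithfulnessScorer,
)

# API scorer configs are pydantic models; threshold defaults to 0.5.
scorer = FaithfulnessScorer(threshold=0.8)
print(scorer)  # JudgmentScorer(score_type=..., threshold=0.8)
```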
judgeval/scorers/api_scorer.py
CHANGED
```diff
@@ -8,8 +8,9 @@ from __future__ import annotations
 
 from pydantic import BaseModel, field_validator
 from typing import List
-from judgeval.constants import
+from judgeval.constants import APIScorerType
 from judgeval.data.example import ExampleParams
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
 
 class APIScorerConfig(BaseModel):
@@ -29,8 +30,8 @@ class APIScorerConfig(BaseModel):
     name: str = ""
     threshold: float = 0.5
     strict_mode: bool = False
+    model: str = JUDGMENT_DEFAULT_GPT_MODEL
 
-    # This is used to check if the example has the required parameters before running the scorer
     required_params: List[ExampleParams] = []
 
     kwargs: dict = {}
@@ -42,16 +43,10 @@ class APIScorerConfig(BaseModel):
         Validates that the threshold is between 0 and 1 inclusive.
         """
         score_type = info.data.get("score_type")
-        if
-
-
-
-            )
-        else:
-            if not 0 <= v <= 1:
-                raise ValueError(
-                    f"Threshold for {score_type} must be between 0 and 1, got: {v}"
-                )
+        if not 0 <= v <= 1:
+            raise ValueError(
+                f"Threshold for {score_type} must be between 0 and 1, got: {v}"
+            )
         return v
 
     @field_validator("name", mode="after")
@@ -63,3 +58,11 @@ class APIScorerConfig(BaseModel):
 
     def __str__(self):
         return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
+
+
+class ExampleAPIScorerConfig(APIScorerConfig):
+    pass
+
+
+class TraceAPIScorerConfig(APIScorerConfig):
+    pass
```
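With the removed special-case branch gone, the validator applies the same 0-to-1 bound to every score type, and every config now carries a `model` field defaulting to `JUDGMENT_DEFAULT_GPT_MODEL` from `judgeval.env`. A quick sketch of the resulting behavior, assuming only what the validator body above shows:

```python
from pydantic import ValidationError
from judgeval.scorers import FaithfulnessScorer

FaithfulnessScorer(threshold=0.5)  # accepted: 0 <= 0.5 <= 1

try:
    FaithfulnessScorer(threshold=1.5)  # now out of range for every score type
except ValidationError as err:
    # pydantic surfaces the validator's ValueError:
    # "Threshold for ... must be between 0 and 1, got: 1.5"
    print(err)
```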
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py
CHANGED
```diff
@@ -1,18 +1,10 @@
-"""
-`judgeval` answer relevancy scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class AnswerCorrectnessScorer(APIScorerConfig):
+class AnswerCorrectnessScorer(ExampleAPIScorerConfig):
     score_type: APIScorerType = APIScorerType.ANSWER_CORRECTNESS
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,
```
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py
CHANGED
```diff
@@ -1,10 +1,10 @@
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class AnswerRelevancyScorer(APIScorerConfig):
+class AnswerRelevancyScorer(ExampleAPIScorerConfig):
     score_type: APIScorerType = APIScorerType.ANSWER_RELEVANCY
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,
```
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py
CHANGED
```diff
@@ -1,18 +1,10 @@
-"""
-`judgeval` faithfulness scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 from typing import List
 
 
-class FaithfulnessScorer(APIScorerConfig):
+class FaithfulnessScorer(ExampleAPIScorerConfig):
     score_type: APIScorerType = APIScorerType.FAITHFULNESS
     required_params: List[ExampleParams] = [
         ExampleParams.INPUT,
```
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py
CHANGED
```diff
@@ -1,17 +1,9 @@
-"""
-`judgeval` instruction adherence scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
 from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 
 
-class InstructionAdherenceScorer(APIScorerConfig):
+class InstructionAdherenceScorer(ExampleAPIScorerConfig):
     def __init__(self, threshold: float):
         super().__init__(
             threshold=threshold,
@@ -21,7 +13,3 @@ class InstructionAdherenceScorer(APIScorerConfig):
                 ExampleParams.ACTUAL_OUTPUT,
             ],
         )
-
-    @property
-    def __name__(self):
-        return "Instruction Adherence"
```
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py
CHANGED
```diff
@@ -1,24 +1,26 @@
 from judgeval.scorers.api_scorer import (
     APIScorerConfig,
+    ExampleAPIScorerConfig,
+    TraceAPIScorerConfig,
 )
 from judgeval.constants import APIScorerType
-from typing import Dict, Any, Optional
+from typing import Dict, Any
 from judgeval.api import JudgmentSyncClient
 from judgeval.exceptions import JudgmentAPIError
 import os
-from copy import copy
 from judgeval.logger import judgeval_logger
 from abc import ABC
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
 
 def push_prompt_scorer(
     name: str,
     prompt: str,
     threshold: float,
-
+    model: str = JUDGMENT_DEFAULT_GPT_MODEL,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
-    is_trace:
+    is_trace: bool = False,
 ) -> str:
     client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
@@ -27,7 +29,7 @@ def push_prompt_scorer(
             "name": name,
             "prompt": prompt,
             "threshold": threshold,
-            "
+            "model": model,
             "is_trace": is_trace,
         }
     )
@@ -94,17 +96,8 @@ def scorer_exists(
 
 
 class BasePromptScorer(ABC, APIScorerConfig):
-    """
-    In the Judgment backend, this scorer is implemented as a PromptScorer that takes
-    1. a system role that may involve the Example object
-    2. options for scores on the example
-
-    and uses a judge to execute the evaluation from the system role and classify into one of the options
-    """
-
     score_type: APIScorerType
     prompt: str
-    options: Optional[Dict[str, float]] = None
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
 
@@ -131,7 +124,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             name=name,
             prompt=scorer_config["prompt"],
             threshold=scorer_config["threshold"],
-
+            model=scorer_config.get("model"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -142,7 +135,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
         name: str,
         prompt: str,
         threshold: float = 0.5,
-
+        model: str = JUDGMENT_DEFAULT_GPT_MODEL,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
@@ -157,7 +150,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             name,
             prompt,
             threshold,
-
+            model,
             judgment_api_key,
             organization_id,
             is_trace,
@@ -168,7 +161,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             name=name,
             prompt=prompt,
             threshold=threshold,
-
+            model=model,
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -198,16 +191,13 @@ class BasePromptScorer(ABC, APIScorerConfig):
         self.push_prompt_scorer()
         judgeval_logger.info(f"Successfully updated prompt for {self.name}")
 
-    def
+    def set_model(self, model: str):
         """
-        Updates the
-
-        Sample options:
-        {"yes": 1, "no": 0}
+        Updates the model of the scorer.
         """
-        self.
+        self.model = model
         self.push_prompt_scorer()
-        judgeval_logger.info(f"Successfully updated
+        judgeval_logger.info(f"Successfully updated model for {self.name}")
 
     def append_to_prompt(self, prompt_addition: str):
         """
@@ -218,23 +208,23 @@ class BasePromptScorer(ABC, APIScorerConfig):
         judgeval_logger.info(f"Successfully appended to prompt for {self.name}")
 
     # Getters
-    def get_threshold(self) -> float
+    def get_threshold(self) -> float:
         """
        Returns the threshold of the scorer.
         """
         return self.threshold
 
-    def get_prompt(self) -> str
+    def get_prompt(self) -> str:
         """
         Returns the prompt of the scorer.
         """
         return self.prompt
 
-    def
+    def get_model(self) -> str:
         """
-        Returns the
+        Returns the model of the scorer.
         """
-        return
+        return self.model
 
     def get_name(self) -> str | None:
         """
@@ -248,9 +238,9 @@ class BasePromptScorer(ABC, APIScorerConfig):
         """
         return {
             "name": self.name,
+            "model": self.model,
             "prompt": self.prompt,
             "threshold": self.threshold,
-            "options": self.options,
         }
 
     def push_prompt_scorer(self):
@@ -261,13 +251,14 @@ class BasePromptScorer(ABC, APIScorerConfig):
             self.name,
             self.prompt,
             self.threshold,
-            self.
+            self.model,
             self.judgment_api_key,
             self.organization_id,
+            isinstance(self, TracePromptScorer),
         )
 
     def __str__(self):
-        return f"PromptScorer(name={self.name},
+        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold})"
 
     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
@@ -282,9 +273,9 @@ class BasePromptScorer(ABC, APIScorerConfig):
         return base
 
 
-class PromptScorer(BasePromptScorer,
+class PromptScorer(BasePromptScorer, ExampleAPIScorerConfig):
     pass
 
 
-class TracePromptScorer(BasePromptScorer,
+class TracePromptScorer(BasePromptScorer, TraceAPIScorerConfig):
     pass
```
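Net effect of the prompt-scorer changes: the `options` map (e.g. `{"yes": 1, "no": 0}`) is gone from the config, the push payload, and the getters, replaced by a judge `model` string, and the trace flag is now derived from the class rather than stored. A sketch against the module-level helper, whose new signature the diff shows in full (the name and prompt values are hypothetical):

```python
from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import (
    push_prompt_scorer,
)

# Registers/updates the scorer on the Judgment backend and returns a str.
# `model` defaults to JUDGMENT_DEFAULT_GPT_MODEL from judgeval.env;
# credentials default to the JUDGMENT_API_KEY / JUDGMENT_ORG_ID env vars.
result = push_prompt_scorer(
    name="helpfulness-judge",           # hypothetical scorer name
    prompt="Is the response helpful?",  # hypothetical judge prompt
    threshold=0.5,
    is_trace=False,  # True registers it as a trace-level scorer
)
```

On instances, `set_model()` and `get_model()` replace the removed options mutators, and `BasePromptScorer.push_prompt_scorer()` now passes `isinstance(self, TracePromptScorer)` as the trace flag instead of a stored field.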
judgeval/scorers/score.py
CHANGED
judgeval/scorers/utils.py
CHANGED
```diff
@@ -11,7 +11,4 @@ def clone_scorers(scorers: List[BaseScorer]) -> List[BaseScorer]:
     """
     Creates duplicates of the scorers passed as argument.
     """
-    cloned_scorers = []
-    for s in scorers:
-        cloned_scorers.append(s.model_copy(deep=True))
-    return cloned_scorers
+    return [s.model_copy(deep=True) for s in scorers]
```
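The rewrite keeps the deep-copy semantics: each clone is an independent `model_copy(deep=True)` of its source. A small sketch (using an API scorer config purely to illustrate; any pydantic-based scorer behaves the same way):

```python
from judgeval.scorers.utils import clone_scorers
from judgeval.scorers import FaithfulnessScorer

originals = [FaithfulnessScorer(threshold=0.8)]
clones = clone_scorers(originals)

# Deep copies: mutating a clone leaves the original untouched.
clones[0].threshold = 0.3
assert originals[0].threshold == 0.8
```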