judgeval 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/cli.py +65 -0
- judgeval/common/api/api.py +44 -38
- judgeval/common/api/constants.py +18 -5
- judgeval/common/api/json_encoder.py +8 -9
- judgeval/common/tracer/core.py +278 -256
- judgeval/common/tracer/otel_span_processor.py +1 -1
- judgeval/common/tracer/span_processor.py +1 -1
- judgeval/common/tracer/span_transformer.py +2 -1
- judgeval/data/evaluation_run.py +104 -0
- judgeval/data/judgment_types.py +37 -8
- judgeval/data/trace.py +1 -0
- judgeval/data/trace_run.py +0 -2
- judgeval/integrations/langgraph.py +2 -1
- judgeval/judgment_client.py +102 -47
- judgeval/local_eval_queue.py +3 -5
- judgeval/run_evaluation.py +33 -192
- judgeval/scorers/base_scorer.py +9 -10
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
- {judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/METADATA +3 -1
- {judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/RECORD +23 -21
- judgeval-0.6.0.dist-info/entry_points.txt +2 -0
- judgeval/evaluation_run.py +0 -80
- {judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/WHEEL +0 -0
- {judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
CHANGED
@@ -24,7 +24,7 @@ from judgeval.common.logger import judgeval_logger
 if TYPE_CHECKING:
     from judgeval.common.tracer import Tracer
     from judgeval.data.trace_run import TraceRun
-    from judgeval.evaluation_run import EvaluationRun
+    from judgeval.data.evaluation_run import EvaluationRun
     from judgeval.integrations.langgraph import JudgevalCallbackHandler


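Grounded in the hunk above: EvaluationRun now lives under judgeval.data instead of the package root. A minimal migration sketch for downstream code (the try/except fallback is illustrative, not part of the SDK):

# Sketch only: prefer the new 0.6.0 location, fall back on older installs.
try:
    from judgeval.data.evaluation_run import EvaluationRun  # judgeval >= 0.6.0
except ImportError:
    from judgeval.evaluation_run import EvaluationRun  # judgeval <= 0.5.0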
@@ -140,80 +140,6 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
     return results


-def check_experiment_type(
-    eval_name: str,
-    project_name: str,
-    judgment_api_key: str,
-    organization_id: str,
-    is_trace: bool,
-) -> None:
-    """
-    Checks if the current experiment, if one exists, has the same type (examples of traces)
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-
-    try:
-        api_client.check_experiment_type(eval_name, project_name, is_trace)
-    except JudgmentAPIException as e:
-        if e.response.status_code == 422:
-            judgeval_logger.error(f"{e.response_json}")
-            raise ValueError(f"{e.response_json}")
-        else:
-            raise e
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if experiment type exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")
-
-
-def check_eval_run_name_exists(
-    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
-) -> None:
-    """
-    Checks if an evaluation run name already exists for a given project.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-
-    Raises:
-        ValueError: If the evaluation run name already exists
-        JudgmentAPIError: If there's an API error during the check
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        api_client.check_eval_run_name_exists(eval_name, project_name)
-    except JudgmentAPIException as e:
-        if e.response.status_code == 409:
-            error_str = f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true. See https://docs.judgmentlabs.ai/sdk-reference/judgment-client#override for more information."
-            judgeval_logger.error(error_str)
-            raise ValueError(error_str)
-        else:
-            raise e
-
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if eval run name exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
-
-
-def check_example_keys(
-    keys: List[str],
-    eval_name: str,
-    project_name: str,
-    judgment_api_key: str,
-    organization_id: str,
-) -> None:
-    """
-    Checks if the current experiment (if one exists) has the same keys for example
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        api_client.check_example_keys(keys, eval_name, project_name)
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if example keys match: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if example keys match: {str(e)}")
-
-
 def log_evaluation_results(
     scoring_results: List[ScoringResult],
     run: Union[EvaluationRun, TraceRun],
@@ -285,29 +211,10 @@ def check_examples(
 def run_trace_eval(
     trace_run: TraceRun,
     judgment_api_key: str,
-    override: bool = False,
     function: Optional[Callable] = None,
     tracer: Optional[Union[Tracer, "JudgevalCallbackHandler"]] = None,
     examples: Optional[List[Example]] = None,
 ) -> List[ScoringResult]:
-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and not trace_run.append:
-        check_eval_run_name_exists(
-            trace_run.eval_name,
-            trace_run.project_name,
-            judgment_api_key,
-            trace_run.organization_id,
-        )
-
-    if trace_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples or traces)
-        check_experiment_type(
-            trace_run.eval_name,
-            trace_run.project_name,
-            judgment_api_key,
-            trace_run.organization_id,
-            True,
-        )
     if function and tracer and examples is not None:
         new_traces: List[Trace] = []

@@ -376,43 +283,8 @@ def run_trace_eval(
     return scoring_results


-async def get_evaluation_status(
-    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
-) -> Dict:
-    """
-    Gets the status of an async evaluation run.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-        organization_id (str): Organization ID for the evaluation
-
-    Returns:
-        Dict: Status information including:
-            - status: 'pending', 'running', 'completed', or 'failed'
-            - results: List of ScoringResult objects if completed
-            - error: Error message if failed
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        return api_client.get_evaluation_status(eval_name, project_name)
-    except Exception as e:
-        raise JudgmentAPIError(
-            f"An error occurred while checking evaluation status: {str(e)}"
-        )
-
-
-def retrieve_counts(result: Dict):
-    scorer_data_count = 0
-    for example in result.get("examples", []):
-        for scorer in example.get("scorer_data", []):
-            scorer_data_count += 1
-    return scorer_data_count
-
-
 def _poll_evaluation_until_complete(
-
+    experiment_run_id: str,
     project_name: str,
     judgment_api_key: str,
     organization_id: str,
@@ -443,14 +315,16 @@ def _poll_evaluation_until_complete(
        poll_count += 1
        try:
            # Check status
-            status_response = api_client.get_evaluation_status(
+            status_response = api_client.get_evaluation_status(
+                experiment_run_id, project_name
+            )

            if status_response.get("status") != "completed":
                time.sleep(poll_interval_seconds)
                continue

            results_response = api_client.fetch_evaluation_results(
-
+                experiment_run_id, project_name
            )
            url = results_response.get("ui_results_url")

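The polling helper is now keyed by an experiment run ID rather than an eval name. A simplified, standalone restatement of that flow; the api_client argument stands in for judgeval's JudgmentApiClient, and only the two method calls and the "ui_results_url" key are taken from the hunks above:

import time

def wait_for_results(api_client, experiment_run_id: str, project_name: str,
                     poll_interval_seconds: float = 5.0, max_polls: int = 120):
    # Illustrative sketch, not the SDK implementation.
    for _ in range(max_polls):
        status = api_client.get_evaluation_status(experiment_run_id, project_name)
        if status.get("status") == "completed":
            results = api_client.fetch_evaluation_results(experiment_run_id, project_name)
            return results, results.get("ui_results_url")
        time.sleep(poll_interval_seconds)
    raise TimeoutError("evaluation did not complete within the polling budget")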
@@ -513,14 +387,12 @@ def progress_logger(stop_event, msg="Working...", interval=5):
 def run_eval(
     evaluation_run: EvaluationRun,
     judgment_api_key: str,
-    override: bool = False,
 ) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s

     Args:
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
-        override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.

     Returns:
         List[ScoringResult]: A list of ScoringResult objects
@@ -534,52 +406,31 @@ def run_eval(
                f"All examples must have the same keys: {current_keys} != {keys}"
            )

-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and not evaluation_run.append:
-        check_eval_run_name_exists(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            judgment_api_key,
-            evaluation_run.organization_id,
-        )
-
-    if evaluation_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples of traces)
-        check_experiment_type(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            judgment_api_key,
-            evaluation_run.organization_id,
-            False,
-        )
-
-        # Ensure that current experiment (if one exists) has the same keys for example
-        check_example_keys(
-            keys=list(keys),
-            eval_name=evaluation_run.eval_name,
-            project_name=evaluation_run.project_name,
-            judgment_api_key=judgment_api_key,
-            organization_id=evaluation_run.organization_id,
-        )
-
-    judgment_scorers: List[APIScorerConfig] = []
-    local_scorers: List[BaseScorer] = []
-    for scorer in evaluation_run.scorers:
-        if isinstance(scorer, APIScorerConfig):
-            judgment_scorers.append(scorer)
-        else:
-            local_scorers.append(scorer)
-
    results: List[ScoringResult] = []
    url = ""

-    if
+    if (
+        len(evaluation_run.custom_scorers) > 0
+        and len(evaluation_run.judgment_scorers) > 0
+    ):
        error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
        judgeval_logger.error(error_msg)
        raise ValueError(error_msg)

-
-
+    e2b_scorers = [cs for cs in evaluation_run.custom_scorers if cs.server_hosted]
+
+    if evaluation_run.judgment_scorers or e2b_scorers:
+        if evaluation_run.judgment_scorers and e2b_scorers:
+            error_msg = "We currently do not support running both hosted custom scorers and Judgment API scorers at the same time. Please run your evaluation with one or the other, but not both."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        if len(e2b_scorers) > 1:
+            error_msg = "We currently do not support running multiple hosted custom scorers at the same time."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        check_examples(evaluation_run.examples, evaluation_run.judgment_scorers)
        stop_event = threading.Event()
        t = threading.Thread(
            target=progress_logger, args=(stop_event, "Running evaluation...")
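run_eval now reads judgment_scorers and custom_scorers directly off the EvaluationRun instead of partitioning a single scorers list, and it rejects unsupported combinations up front. A standalone restatement of those rules (illustrative only; the real checks operate on judgeval's EvaluationRun as shown above):

from typing import Sequence

def validate_scorer_mix(judgment_scorers: Sequence, custom_scorers: Sequence) -> None:
    # Mirrors the 0.6.0 pre-flight checks in run_eval; "hosted" scorers are the
    # custom scorers whose server_hosted flag is set (e2b-backed).
    hosted = [cs for cs in custom_scorers if getattr(cs, "server_hosted", False)]
    if judgment_scorers and custom_scorers:
        raise ValueError("Run either local/custom scorers or Judgment API scorers, not both")
    if judgment_scorers and hosted:
        raise ValueError("Hosted custom scorers cannot be combined with Judgment API scorers")
    if len(hosted) > 1:
        raise ValueError("Only one hosted custom scorer is supported per run")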
@@ -600,36 +451,26 @@ def run_eval(
            )
            raise JudgmentAPIError(error_message)

-
-
-
-
-
-            )
-            old_scorer_data_count = retrieve_counts(results_response)
-        except Exception:
-            # This usually means the user did append = True but the eval run name doesn't exist yet
-            pass
-
+            num_scorers = (
+                len(evaluation_run.judgment_scorers)
+                if evaluation_run.judgment_scorers
+                else sum(1 for cs in evaluation_run.custom_scorers if cs.server_hosted)
+            )
            results, url = _poll_evaluation_until_complete(
-
+                experiment_run_id=evaluation_run.id,
                project_name=evaluation_run.project_name,
                judgment_api_key=judgment_api_key,
                organization_id=evaluation_run.organization_id,
-                expected_scorer_data_count=(
-                    len(evaluation_run.scorers) * len(evaluation_run.examples)
-                )
-                + old_scorer_data_count,
+                expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
            )
        finally:
            stop_event.set()
            t.join()
-
-    if len(local_scorers) > 0:
+    else:
        results = safe_run_async(
            a_execute_scoring(
                evaluation_run.examples,
-
+                evaluation_run.custom_scorers,
                model=evaluation_run.model,
                throttle_value=0,
                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
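With the append-mode bookkeeping gone, the polling target is simply scorer count times example count. A worked restatement of the num_scorers arithmetic above (standalone and illustrative):

def expected_scorer_data_count(judgment_scorers, custom_scorers, examples) -> int:
    # Mirrors how run_eval sizes the expected result set in 0.6.0.
    if judgment_scorers:
        num_scorers = len(judgment_scorers)
    else:
        num_scorers = sum(1 for cs in custom_scorers if getattr(cs, "server_hosted", False))
    return num_scorers * len(examples)

# e.g. 3 Judgment API scorers over 20 examples -> wait for 60 scorer_data entries
assert expected_scorer_data_count(["s1", "s2", "s3"], [], list(range(20))) == 60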
judgeval/scorers/base_scorer.py
CHANGED
@@ -26,6 +26,7 @@ class BaseScorer(BaseModel):
     name: Optional[str] = (
         None  # name of your scorer (Faithfulness, PromptScorer-randomslug)
     )
+    class_name: Optional[str] = None  # The name of the class of the scorer
     score: Optional[float] = None  # The float score of the scorer run on the test case
     score_breakdown: Optional[Dict] = None
     reason: Optional[str] = ""
@@ -39,24 +40,22 @@ class BaseScorer(BaseModel):
     error: Optional[str] = None  # The error message if the scorer failed
     additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
     user: Optional[str] = None  # The user ID of the scorer
+    server_hosted: bool = False  # Whether the scorer is enabled for e2b

-    @model_validator(mode="
+    @model_validator(mode="after")
     @classmethod
-    def enforce_strict_threshold(cls, data:
-        if data.
-            data
+    def enforce_strict_threshold(cls, data: "BaseScorer"):
+        if data.strict_mode:
+            data.threshold = 1.0
         return data

     @model_validator(mode="after")
     @classmethod
     def default_name(cls, m: "BaseScorer") -> "BaseScorer":
+        # Always set class_name to the string name of the class
+        m.class_name = m.__class__.__name__
         if not m.name:
-
-            class_name = getattr(m, "__class__", None)
-            if class_name and getattr(m.__class__, "__name__", None):
-                m.name = m.__class__.__name__
-            else:
-                m.name = m.score_type
+            m.name = m.class_name
         return m

     def _add_model(self, model: str):
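BaseScorer now tracks a class_name alongside name, carries a server_hosted flag, and runs both validators in pydantic's "after" mode. A standalone pydantic sketch of that validator behavior (it mirrors the hunk above but is not the SDK class; field defaults such as threshold=0.5 are assumptions):

from typing import Optional
from pydantic import BaseModel, model_validator

class ScorerSketch(BaseModel):
    name: Optional[str] = None
    class_name: Optional[str] = None
    threshold: float = 0.5          # assumed default for this sketch
    strict_mode: bool = False
    server_hosted: bool = False     # new in 0.6.0: marks a hosted (e2b) custom scorer

    @model_validator(mode="after")
    def enforce_strict_threshold(self):
        if self.strict_mode:
            self.threshold = 1.0    # strict mode pins the passing threshold
        return self

    @model_validator(mode="after")
    def default_name(self):
        self.class_name = self.__class__.__name__
        if not self.name:
            self.name = self.class_name
        return self

class MyScorer(ScorerSketch):
    pass

s = MyScorer(strict_mode=True)
assert (s.name, s.class_name, s.threshold) == ("MyScorer", "MyScorer", 1.0)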
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py
CHANGED
@@ -11,13 +11,14 @@ from judgeval.common.logger import judgeval_logger
 def push_prompt_scorer(
     name: str,
     prompt: str,
+    threshold: float,
     options: Optional[Dict[str, float]] = None,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
 ) -> str:
     client = JudgmentApiClient(judgment_api_key, organization_id)
     try:
-        r = client.save_scorer(name, prompt, options)
+        r = client.save_scorer(name, prompt, threshold, options)
     except JudgmentAPIException as e:
         if e.status_code == 500:
             raise JudgmentAPIError(
@@ -90,6 +91,7 @@ class PromptScorer(APIScorerConfig):
         return cls(
             name=name,
             prompt=scorer_config["prompt"],
+            threshold=scorer_config["threshold"],
             options=scorer_config.get("options"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
@@ -100,16 +102,20 @@ class PromptScorer(APIScorerConfig):
         cls,
         name: str,
         prompt: str,
+        threshold: Optional[float] = 0.5,
         options: Optional[Dict[str, float]] = None,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
         if not scorer_exists(name, judgment_api_key, organization_id):
-            push_prompt_scorer(
+            push_prompt_scorer(
+                name, prompt, threshold, options, judgment_api_key, organization_id
+            )
             judgeval_logger.info(f"Successfully created PromptScorer: {name}")
         return cls(
             name=name,
             prompt=prompt,
+            threshold=threshold,
             options=options,
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
@@ -158,6 +164,12 @@ class PromptScorer(APIScorerConfig):
         judgeval_logger.info(f"Successfully appended to prompt for {self.name}")

     # Getters
+    def get_threshold(self) -> float | None:
+        """
+        Returns the threshold of the scorer.
+        """
+        return self.threshold
+
     def get_prompt(self) -> str | None:
         """
         Returns the prompt of the scorer.
@@ -183,6 +195,7 @@ class PromptScorer(APIScorerConfig):
         return {
             "name": self.name,
             "prompt": self.prompt,
+            "threshold": self.threshold,
             "options": self.options,
         }

@@ -193,13 +206,14 @@ class PromptScorer(APIScorerConfig):
         push_prompt_scorer(
             self.name,
             self.prompt,
+            self.threshold,
             self.options,
             self.judgment_api_key,
             self.organization_id,
         )

     def __str__(self):
-        return f"PromptScorer(name={self.name}, prompt={self.prompt}, options={self.options})"
+        return f"PromptScorer(name={self.name}, prompt={self.prompt}, threshold={self.threshold}, options={self.options})"

     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
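Prompt scorers now carry an explicit threshold through the save, load, and update paths shown above. A hedged usage sketch of the module-level push_prompt_scorer helper (requires JUDGMENT_API_KEY and JUDGMENT_ORG_ID in the environment; the scorer name, prompt, and option values are made up for illustration):

from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import push_prompt_scorer

# threshold is now a required argument when pushing a prompt scorer.
result = push_prompt_scorer(
    name="helpfulness-check",  # hypothetical scorer name
    prompt="Rate how helpful the response is to the user's question.",
    threshold=0.7,             # pass/fail cutoff stored with the scorer
    options={"helpful": 1.0, "unhelpful": 0.0},
)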
{judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.5.0
+Version: 0.6.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: boto3
+Requires-Dist: click<8.2.0
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
@@ -23,6 +24,7 @@ Requires-Dist: orjson>=3.9.0
 Requires-Dist: python-dotenv
 Requires-Dist: requests
 Requires-Dist: rich
+Requires-Dist: typer>=0.9.0
 Provides-Extra: langchain
 Requires-Dist: langchain-anthropic; extra == 'langchain'
 Requires-Dist: langchain-core; extra == 'langchain'
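The new click and typer requirements line up with the added judgeval/cli.py and entry_points.txt, i.e. the wheel now installs a console script whose name is not visible in this diff. A small standard-library sketch for discovering whatever entry point an installed judgeval 0.6.0 registers:

from importlib.metadata import entry_points

# List console scripts whose implementation lives in the judgeval package.
for ep in entry_points(group="console_scripts"):
    if ep.module.split(".")[0] == "judgeval":
        print(f"{ep.name} -> {ep.value}")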
{judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/RECORD
CHANGED
@@ -1,43 +1,44 @@
 judgeval/__init__.py,sha256=5Lm1JMYFREJGN_8X-Wpruu_ovwGLJ08gCzNAt-u-pQE,419
+judgeval/cli.py,sha256=IcL4_bGr9CtEeea1-AFqM_TEV_VomDlArlxh4IomiSQ,1754
 judgeval/clients.py,sha256=HHul68PV1om0dxsVZZu90TtCiy5zaqAwph16jXTQzQo,989
 judgeval/constants.py,sha256=UNoTLHgbpZHRInPM2ZaI3m0XokPkee5ILlg20reqhzo,4180
 judgeval/dataset.py,sha256=vOrDKam2I-K1WcVF5IBkQruCDvXTc8PRaFm4-dV0lXs,6220
-judgeval/
-judgeval/
-judgeval/local_eval_queue.py,sha256=fAI0_OlvCr-WOCQWw18C4JIRJHKYzlyGzsGUm8LcsYE,7076
+judgeval/judgment_client.py,sha256=-7xcBFowzXKedMINwfZCOL4FKucECWPNEY9QVMo_cys,13644
+judgeval/local_eval_queue.py,sha256=GmlXeZt7bfAJe1hPUjDg_irth4RkNqL2Zdi7VzboBzI,6984
 judgeval/rules.py,sha256=CoQjqmP8daEXewMkplmA-7urubDtweOr5O6z8klVwLI,20031
-judgeval/run_evaluation.py,sha256=
+judgeval/run_evaluation.py,sha256=gs-_v_i95LKlJj95G2RmQXvIyBfoldnd1pWCNO4UqsM,21985
 judgeval/version_check.py,sha256=FoLEtpCjDw2HuDQdpw5yT29UtwumSc6ZZN6AV_c9Mnw,1057
 judgeval/common/__init__.py,sha256=KH-QJyWtQ60R6yFIBDYS3WGRiNpEu1guynpxivZvpBQ,309
 judgeval/common/exceptions.py,sha256=OkgDznu2wpBQZMXiZarLJYNk1HIcC8qYW7VypDC3Ook,556
 judgeval/common/logger.py,sha256=514eFLYWS_UL8VY-zAR2ePUlpQe4rbYlleLASFllLE4,1511
 judgeval/common/utils.py,sha256=oxGDRVWOICKWeyGgsoc36_yAyHSYF4XtH842Mkznwis,34739
 judgeval/common/api/__init__.py,sha256=-E7lpZz1fG8puR_aYUMfPmQ-Vyhd0bgzoaU5EhIuFjQ,114
-judgeval/common/api/api.py,sha256=
-judgeval/common/api/constants.py,sha256=
-judgeval/common/api/json_encoder.py,sha256=
+judgeval/common/api/api.py,sha256=fWtMNln0o1wOhJ9wangWpyY_j3WF7P3at_LYPJEicP0,13670
+judgeval/common/api/constants.py,sha256=y0BDcQqHBZ7MwLd4gT5hLUF8UMs_GVwsJGC-ibfxCAw,4698
+judgeval/common/api/json_encoder.py,sha256=QQgCe2FBmW1uWKx8yvuhr4U7_b4D0sG97GZtXHKnBdk,5881
 judgeval/common/storage/__init__.py,sha256=a-PI7OL-ydyzugGUKmJKRBASnK-Q-gs82L9K9rSyJP8,90
 judgeval/common/storage/s3_storage.py,sha256=0-bNKheqJJyBZ92KGrzQtd1zocIRWBlfn_58L4a-Ay0,3719
 judgeval/common/tracer/__init__.py,sha256=tJCJsmVmrL89Phv88gNCJ-j0ITPez6lh8vhMAAlLNSc,795
 judgeval/common/tracer/constants.py,sha256=yu5y8gMe5yb1AaBkPtAH-BNwIaAR3NwYCRoSf45wp5U,621
-judgeval/common/tracer/core.py,sha256=
+judgeval/common/tracer/core.py,sha256=TQ80NODaJx7gzmntevDLA3evVJ3m2Zy2s0Pwd7APG9Y,84867
 judgeval/common/tracer/otel_exporter.py,sha256=kZLlOQ6afQE4dmb9H1wgU4P3H5PG1D_zKyvnpWcT5Ak,3899
-judgeval/common/tracer/otel_span_processor.py,sha256=
+judgeval/common/tracer/otel_span_processor.py,sha256=BD-FKXaZft5_3zqy1Qe_tpkudVOLop9AGhBjZUgp-Z8,6502
 judgeval/common/tracer/providers.py,sha256=3c3YOtKuoBjlTL0rc2HAGnUpppqvsyzrN5H6EKCqEi0,2733
-judgeval/common/tracer/span_processor.py,sha256=
-judgeval/common/tracer/span_transformer.py,sha256=
+judgeval/common/tracer/span_processor.py,sha256=1NQxNSVWcb8qCFLmslSVMnaWdkOZmiFJnxeeN0i6vnU,1150
+judgeval/common/tracer/span_transformer.py,sha256=cfzz6RpTCOG9Io9knNlwtAW34p3wyK-u8jSNMu24p1w,7382
 judgeval/common/tracer/trace_manager.py,sha256=ltiXcWC-68DRc8uSa28qHiWRSIBf6NpYOPkZYooR8tg,3086
 judgeval/data/__init__.py,sha256=1QagDcSQtfnJ632t9Dnq8d7XjAqhmY4mInOWt8qH9tM,455
+judgeval/data/evaluation_run.py,sha256=IirmYZ1_9N99eep7DDuoyshwjmpNK9bQCxCWXnnhhuI,4053
 judgeval/data/example.py,sha256=kRskIgsjwcvv2Y8jaPwV-PND7zlmMbFsvRVQ_b7SZY0,914
-judgeval/data/judgment_types.py,sha256=
+judgeval/data/judgment_types.py,sha256=3nGCUZ1YJhXajhFlAQvax0SOJ8eLuORtquwwjMreJFw,9826
 judgeval/data/result.py,sha256=OtSnBUrdQpjyAqxXRLTW3wC9v9lOm_GqzL14ccRQxrg,2124
 judgeval/data/scorer_data.py,sha256=5QBHtvOIWOq0Rn9_uPJzAMRYMlWxMB-rXnG_6kV4Z4Y,2955
 judgeval/data/tool.py,sha256=iWQSdy5uNbIeACu3gQy1DC2oGYxRVYNfkkczWdQMAiA,99
-judgeval/data/trace.py,sha256=
-judgeval/data/trace_run.py,sha256=
+judgeval/data/trace.py,sha256=S781vVU1BvQ_kTS3s7UGYdmYVVxVGjDzWJHZpHedyf0,2834
+judgeval/data/trace_run.py,sha256=Oo1vDrJYX_itt4tt7PJf7fNKd0HE3fnBJxuIkRY8Wrg,1585
 judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
 judgeval/data/scripts/openapi_transform.py,sha256=Sm04JClzyP1ga8KA3gkIdsae8Hlx-XU7-x0gHCQYOhg,3877
-judgeval/integrations/langgraph.py,sha256=
+judgeval/integrations/langgraph.py,sha256=XsTNpKvXZmSf4TJBtRKSd5AB7S-Td9GTG5wZW9Npj6k,30062
 judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
 judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
 judgeval/judges/litellm_judge.py,sha256=K9yCGOmozt7sYO0u8CHWyZNi8mXnSR3pPkP8yVsvuRc,2561
@@ -47,7 +48,7 @@ judgeval/judges/utils.py,sha256=_t6oYN9q63wyP7D4jI8X0bNmvVw7OfaE7uMTYDVS14E,2782
 judgeval/scorers/__init__.py,sha256=4H_cinTQ4EogZv59YEV-3U9EOTLppNwgAPTi1-jI9Fw,746
 judgeval/scorers/agent_scorer.py,sha256=TjwD_YglSywr3EowEojiCyg5qDgCRa5LRGc5nFdmIBc,703
 judgeval/scorers/api_scorer.py,sha256=xlhqkeMUBFxl8daSXOTWOYwZjBAz7o6b4sVD5f8cIHw,2523
-judgeval/scorers/base_scorer.py,sha256=
+judgeval/scorers/base_scorer.py,sha256=hKrLLh2DaxTgAfze8p_IapvsrogRCevYgfaNCDeOJzc,2869
 judgeval/scorers/example_scorer.py,sha256=2n45y3LMV1Q-ARyXLHqvVWETlnY1DqS7OLzPu9IBGz8,716
 judgeval/scorers/exceptions.py,sha256=ACDHK5-TWiF3NTk-wycaedpbrdobm-CvvC1JA_iP-Mk,179
 judgeval/scorers/score.py,sha256=SWyoqOOvyLpLy39tLyb_Q94sdh9r_IuDv6YNREw52lg,7546
@@ -61,7 +62,7 @@ judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=NABO_iBd
 judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=ps51bTgQsD9xGYsk1v9bx0WxQMqywSllCE9_xlJkLd8,531
 judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=SnFLvU4FGsMeUVUp0SGHSy_6wgfwr_vHPGnZx5YJl_Q,691
 judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=aQzu-TiGqG74JDQ927evv5yGmnZw2AOolyHvlIhiUbI,683
-judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=
+judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=1FsUGjQu3oa2rF-oqt32j-yA2YM33_trGTJ0HgagFJ0,7793
 judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py,sha256=Mcp1CjMNyOax9UkvoRdSyUYdO2Os1-Nko43y89m2Luo,594
 judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py,sha256=Z2FLGBC7m_CLx-CMgXVuTvYvN0vY5yOcWA0ImBkeBfY,787
 judgeval/tracer/__init__.py,sha256=wkuXtOGDCrwgPPXlh_sSJmvGuWaAMHyNzk1TzB5f9aI,148
@@ -69,7 +70,8 @@ judgeval/utils/alerts.py,sha256=3w_AjQrgfmOZvfqCridW8WAnHVxHHXokX9jNzVFyGjA,3297
 judgeval/utils/async_utils.py,sha256=uNx1SopEc0quSjc8GBQqyba0SmCMAzv2NKIq6xYwttc,989
 judgeval/utils/file_utils.py,sha256=PWHRs8dUr8iDwpglSSk4Yjd7C6ZhDzUaO-jV3m7riHM,1987
 judgeval/utils/requests.py,sha256=K3gUKrwL6TvwYKVYO5OeLWdUHn9NiUPmnIXhZEiEaHU,1534
-judgeval-0.
-judgeval-0.
-judgeval-0.
-judgeval-0.
+judgeval-0.6.0.dist-info/METADATA,sha256=CulXMs0v5YrHjR3ntVX8xWKcZyxwEpo_nOYs_hkaeN8,10403
+judgeval-0.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.6.0.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
+judgeval-0.6.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.6.0.dist-info/RECORD,,
judgeval/evaluation_run.py
DELETED
@@ -1,80 +0,0 @@
-from typing import List, Optional, Union
-from pydantic import BaseModel, field_validator, Field
-
-from judgeval.data import Example
-from judgeval.scorers import BaseScorer, APIScorerConfig
-from judgeval.constants import ACCEPTABLE_MODELS, DEFAULT_GPT_MODEL
-
-
-class EvaluationRun(BaseModel):
-    """
-    Stores example and evaluation scorers together for running an eval task
-
-    Args:
-        project_name (str): The name of the project the evaluation results belong to
-        eval_name (str): A name for this evaluation run
-        examples (List[Example]): The examples to evaluate
-        scorers (List[Union[JudgmentScorer, BaseScorer]]): A list of scorers to use for evaluation
-        model (str): The model used as a judge when using LLM as a Judge
-        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
-    """
-
-    organization_id: Optional[str] = None
-    project_name: Optional[str] = Field(default=None, validate_default=True)
-    eval_name: Optional[str] = Field(default=None, validate_default=True)
-    examples: List[Example]
-    scorers: List[Union[APIScorerConfig, BaseScorer]]
-    model: Optional[str] = DEFAULT_GPT_MODEL
-    trace_span_id: Optional[str] = None
-    trace_id: Optional[str] = None
-    # API Key will be "" until user calls client.run_eval(), then API Key will be set
-    override: Optional[bool] = False
-    append: Optional[bool] = False
-
-    def model_dump(self, **kwargs):
-        data = super().model_dump(**kwargs)
-
-        data["scorers"] = [
-            scorer.model_dump() for scorer in self.scorers
-        ]  # Pydantic has problems with properly calling model_dump() on the scorers, so we need to do it manually
-        data["examples"] = [example.model_dump() for example in self.examples]
-
-        return data
-
-    @field_validator("examples")
-    def validate_examples(cls, v):
-        if not v:
-            raise ValueError("Examples cannot be empty.")
-        for item in v:
-            if not isinstance(item, Example):
-                raise ValueError(f"Item of type {type(item)} is not a Example")
-        return v
-
-    @field_validator("scorers", mode="before")
-    def validate_scorers(cls, v):
-        if not v:
-            raise ValueError("Scorers cannot be empty.")
-        if not all(
-            isinstance(scorer, BaseScorer) or isinstance(scorer, APIScorerConfig)
-            for scorer in v
-        ):
-            raise ValueError(
-                "All scorers must be of type BaseScorer or APIScorerConfig."
-            )
-        return v
-
-    @field_validator("model")
-    def validate_model(cls, v, values):
-        if not v:
-            raise ValueError("Model cannot be empty.")
-
-        # Check if model is string or list of strings
-        if isinstance(v, str):
-            if v not in ACCEPTABLE_MODELS:
-                raise ValueError(
-                    f"Model name {v} not recognized. Please select a valid model name.)"
-                )
-        return v
-
-    class Config:
-        arbitrary_types_allowed = True
{judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/WHEEL
File without changes
{judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/licenses/LICENSE.md
File without changes