judgeval 0.0.54__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/api/__init__.py +3 -0
- judgeval/common/api/api.py +352 -0
- judgeval/common/api/constants.py +165 -0
- judgeval/common/storage/__init__.py +6 -0
- judgeval/common/tracer/__init__.py +31 -0
- judgeval/common/tracer/constants.py +22 -0
- judgeval/common/tracer/core.py +1916 -0
- judgeval/common/tracer/otel_exporter.py +108 -0
- judgeval/common/tracer/otel_span_processor.py +234 -0
- judgeval/common/tracer/span_processor.py +37 -0
- judgeval/common/tracer/span_transformer.py +211 -0
- judgeval/common/tracer/trace_manager.py +92 -0
- judgeval/common/utils.py +2 -2
- judgeval/constants.py +3 -30
- judgeval/data/datasets/eval_dataset_client.py +29 -156
- judgeval/data/judgment_types.py +4 -12
- judgeval/data/result.py +1 -1
- judgeval/data/scorer_data.py +2 -2
- judgeval/data/scripts/openapi_transform.py +1 -1
- judgeval/data/trace.py +66 -1
- judgeval/data/trace_run.py +0 -3
- judgeval/evaluation_run.py +0 -2
- judgeval/integrations/langgraph.py +43 -164
- judgeval/judgment_client.py +17 -211
- judgeval/run_evaluation.py +209 -611
- judgeval/scorers/__init__.py +2 -6
- judgeval/scorers/base_scorer.py +4 -23
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +3 -3
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +215 -0
- judgeval/scorers/score.py +2 -1
- judgeval/scorers/utils.py +1 -13
- judgeval/utils/requests.py +21 -0
- judgeval-0.1.0.dist-info/METADATA +202 -0
- {judgeval-0.0.54.dist-info → judgeval-0.1.0.dist-info}/RECORD +37 -29
- judgeval/common/tracer.py +0 -3215
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +0 -73
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -53
- judgeval-0.0.54.dist-info/METADATA +0 -1384
- /judgeval/common/{s3_storage.py → storage/s3_storage.py} +0 -0
- {judgeval-0.0.54.dist-info → judgeval-0.1.0.dist-info}/WHEEL +0 -0
- {judgeval-0.0.54.dist-info → judgeval-0.1.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judgment_client.py
CHANGED
```diff
@@ -5,11 +5,7 @@ Implements the JudgmentClient to interact with the Judgment API.
 import os
 from uuid import uuid4
 from typing import Optional, List, Dict, Any, Union, Callable
-from requests import codes
-from judgeval.utils.requests import requests
-import asyncio

-from judgeval.constants import ROOT_API
 from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
     ScoringResult,
@@ -19,40 +15,31 @@ from judgeval.data import (
 from judgeval.scorers import (
     APIScorerConfig,
     BaseScorer,
-    ClassifierScorer,
 )
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
     run_trace_eval,
-    safe_run_async,
 )
 from judgeval.data.trace_run import TraceRun
-from judgeval.constants import (
-    JUDGMENT_EVAL_FETCH_API_URL,
-    JUDGMENT_PROJECT_DELETE_API_URL,
-    JUDGMENT_PROJECT_CREATE_API_URL,
-)
+from judgeval.common.api import JudgmentApiClient
 from judgeval.common.exceptions import JudgmentAPIError
 from langchain_core.callbacks import BaseCallbackHandler
 from judgeval.common.tracer import Tracer
 from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
-from judgeval.run_evaluation import SpinnerWrappedTask
 from judgeval.common.logger import judgeval_logger


 class EvalRunRequestBody(BaseModel):
     eval_name: str
     project_name: str
-    judgment_api_key: str


 class DeleteEvalRunRequestBody(BaseModel):
     eval_names: List[str]
     project_name: str
-    judgment_api_key: str


 class SingletonMeta(type):
@@ -83,6 +70,7 @@ class JudgmentClient(metaclass=SingletonMeta):

         self.judgment_api_key = api_key
         self.organization_id = organization_id
+        self.api_client = JudgmentApiClient(api_key, organization_id)
         self.eval_dataset_client = EvalDatasetClient(api_key, organization_id)

         # Verify API key is valid
```
```diff
@@ -93,29 +81,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         else:
             judgeval_logger.info("Successfully initialized JudgmentClient!")

-    def a_run_evaluation(
-        self,
-        examples: List[Example],
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
-        model: Optional[str] = "gpt-4.1",
-        project_name: str = "default_project",
-        eval_run_name: str = "default_eval_run",
-        override: bool = False,
-        append: bool = False,
-    ) -> List[ScoringResult]:
-        result = self.run_evaluation(
-            examples=examples,
-            scorers=scorers,
-            model=model,
-            project_name=project_name,
-            eval_run_name=eval_run_name,
-            override=override,
-            append=append,
-            async_execution=True,
-        )
-        assert not isinstance(result, (asyncio.Task, SpinnerWrappedTask))
-        return result
-
     def run_trace_evaluation(
         self,
         scorers: List[Union[APIScorerConfig, BaseScorer]],
```
```diff
@@ -147,11 +112,12 @@ class JudgmentClient(metaclass=SingletonMeta):
                 scorers=scorers,
                 model=model,
                 append=append,
-                judgment_api_key=self.judgment_api_key,
                 organization_id=self.organization_id,
                 tools=tools,
             )
-            return run_trace_eval(
+            return run_trace_eval(
+                trace_run, self.judgment_api_key, override, function, tracer, examples
+            )
         except ValueError as e:
             raise ValueError(
                 f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}"
@@ -168,8 +134,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         eval_run_name: str = "default_eval_run",
         override: bool = False,
         append: bool = False,
-        async_execution: bool = False,
-    ) -> Union[List[ScoringResult], asyncio.Task | SpinnerWrappedTask]:
+    ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s

@@ -181,7 +146,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             eval_run_name (str): A name for this evaluation run
             override (bool): Whether to override an existing evaluation run with the same name
             append (bool): Whether to append to an existing evaluation run with the same name
-            async_execution (bool): Whether to execute the evaluation asynchronously

         Returns:
             List[ScoringResult]: The results of the evaluation
@@ -194,18 +158,18 @@ class JudgmentClient(metaclass=SingletonMeta):
         try:
             eval = EvaluationRun(
                 append=append,
+                override=override,
                 project_name=project_name,
                 eval_name=eval_run_name,
                 examples=examples,
                 scorers=scorers,
                 model=model,
-                judgment_api_key=self.judgment_api_key,
                 organization_id=self.organization_id,
             )
             return run_eval(
                 eval,
+                self.judgment_api_key,
                 override,
-                async_execution=async_execution,
             )
         except ValueError as e:
             raise ValueError(
```
```diff
@@ -292,158 +256,21 @@ class JudgmentClient(metaclass=SingletonMeta):
             - id (str): The evaluation run ID
             - results (List[ScoringResult]): List of scoring results
         """
-        eval_run_request_body = EvalRunRequestBody(
-            project_name=project_name,
-            eval_name=eval_run_name,
-            judgment_api_key=self.judgment_api_key,
-        )
-        eval_run = requests.post(
-            JUDGMENT_EVAL_FETCH_API_URL,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-                "X-Organization-Id": self.organization_id,
-            },
-            json=eval_run_request_body.model_dump(),
-            verify=True,
-        )
-        if eval_run.status_code != codes.ok:
-            raise ValueError(f"Error fetching eval results: {eval_run.json()}")
-
-        return eval_run.json()
+        return self.api_client.fetch_evaluation_results(project_name, eval_run_name)

     def create_project(self, project_name: str) -> bool:
         """
         Creates a project on the server.
         """
-        response = requests.post(
-            JUDGMENT_PROJECT_CREATE_API_URL,
-            json={
-                "project_name": project_name,
-            },
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-                "X-Organization-Id": self.organization_id,
-            },
-        )
-        if response.status_code != codes.ok:
-            raise ValueError(f"Error creating project: {response.json()}")
-        return response.json()
+        self.api_client.create_project(project_name)
+        return True

     def delete_project(self, project_name: str) -> bool:
         """
         Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
         """
-        response = requests.post(
-            JUDGMENT_PROJECT_DELETE_API_URL,
-            json={
-                "project_name": project_name,
-            },
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-                "X-Organization-Id": self.organization_id,
-            },
-        )
-        if response.status_code != codes.ok:
-            raise ValueError(f"Error deleting project: {response.json()}")
-        return response.json()
-
-    def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer:
-        """
-        Fetches a classifier scorer configuration from the Judgment API.
-
-        Args:
-            slug (str): Slug identifier of the custom scorer to fetch
-
-        Returns:
-            ClassifierScorer: The configured classifier scorer object
-
-        Raises:
-            JudgmentAPIError: If the scorer cannot be fetched or doesn't exist
-        """
-        request_body = {
-            "slug": slug,
-        }
-
-        response = requests.post(
-            f"{ROOT_API}/fetch_scorer/",
-            json=request_body,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-                "X-Organization-Id": self.organization_id,
-            },
-            verify=True,
-        )
-
-        if response.status_code == 500:
-            raise JudgmentAPIError(
-                f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {response.json().get('detail', '')}"
-            )
-        elif response.status_code != 200:
-            raise JudgmentAPIError(
-                f"Failed to fetch classifier scorer '{slug}': {response.json().get('detail', '')}"
-            )
-
-        scorer_config = response.json()
-        scorer_config.pop("created_at")
-        scorer_config.pop("updated_at")
-
-        try:
-            return ClassifierScorer(**scorer_config)
-        except Exception as e:
-            raise JudgmentAPIError(
-                f"Failed to create classifier scorer '{slug}' with config {scorer_config}: {str(e)}"
-            )
-
-    def push_classifier_scorer(
-        self, scorer: ClassifierScorer, slug: str | None = None
-    ) -> str:
-        """
-        Pushes a classifier scorer configuration to the Judgment API.
-
-        Args:
-            slug (str): Slug identifier for the scorer. If it exists, the scorer will be updated.
-            scorer (ClassifierScorer): The classifier scorer to save
-
-        Returns:
-            str: The slug identifier of the saved scorer
-
-        Raises:
-            JudgmentAPIError: If there's an error saving the scorer
-        """
-        request_body = {
-            "name": scorer.name,
-            "conversation": scorer.conversation,
-            "options": scorer.options,
-            "slug": slug,
-        }
-
-        response = requests.post(
-            f"{ROOT_API}/save_scorer/",
-            json=request_body,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-                "X-Organization-Id": self.organization_id,
-            },
-            verify=True,
-        )
-
-        if response.status_code == 500:
-            raise JudgmentAPIError(
-                f"The server is temporarily unavailable. \
-                Please try your request again in a few moments. \
-                Error details: {response.json().get('detail', '')}"
-            )
-        elif response.status_code != 200:
-            raise JudgmentAPIError(
-                f"Failed to save classifier scorer: {response.json().get('detail', '')}"
-            )
-
-        return response.json()["slug"]
+        self.api_client.delete_project(project_name)
+        return True

     def assert_test(
         self,
```
```diff
@@ -454,7 +281,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         eval_run_name: str = str(uuid4()),
         override: bool = False,
         append: bool = False,
-        async_execution: bool = False,
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -470,7 +296,7 @@ class JudgmentClient(metaclass=SingletonMeta):
             async_execution (bool): Whether to run the evaluation asynchronously
         """

-        results:
+        results: List[ScoringResult]

         results = self.run_evaluation(
             examples=examples,
@@ -480,19 +306,8 @@ class JudgmentClient(metaclass=SingletonMeta):
             eval_run_name=eval_run_name,
             override=override,
             append=append,
-            async_execution=async_execution,
         )
-
-        if async_execution and isinstance(results, (asyncio.Task, SpinnerWrappedTask)):
-
-            async def run_async():  # Using wrapper here to resolve mypy error with passing Task into asyncio.run
-                return await results
-
-            actual_results = safe_run_async(run_async())
-            assert_test(actual_results)  # Call the synchronous imported function
-        else:
-            # 'results' is already List[ScoringResult] here (synchronous path)
-            assert_test(results)  # Call the synchronous imported function
+        assert_test(results)

     def assert_trace_test(
         self,
```
```diff
@@ -535,7 +350,7 @@ class JudgmentClient(metaclass=SingletonMeta):
                     f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer."
                 )

-        results:
+        results: List[ScoringResult]

         results = self.run_trace_evaluation(
             examples=examples,
@@ -551,13 +366,4 @@ class JudgmentClient(metaclass=SingletonMeta):
             tools=tools,
         )

-
-
-            async def run_async():  # Using wrapper here to resolve mypy error with passing Task into asyncio.run
-                return await results
-
-            actual_results = safe_run_async(run_async())
-            assert_test(actual_results)  # Call the synchronous imported function
-        else:
-            # 'results' is already List[ScoringResult] here (synchronous path)
-            assert_test(results)  # Call the synchronous imported function
+        assert_test(results)
```