judgeval 0.7.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +139 -12
- judgeval/api/__init__.py +501 -0
- judgeval/api/api_types.py +344 -0
- judgeval/cli.py +2 -4
- judgeval/constants.py +10 -26
- judgeval/data/evaluation_run.py +49 -26
- judgeval/data/example.py +2 -2
- judgeval/data/judgment_types.py +266 -82
- judgeval/data/result.py +4 -5
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +2 -2
- judgeval/data/trace.py +7 -50
- judgeval/data/trace_run.py +7 -4
- judgeval/{dataset.py → dataset/__init__.py} +43 -28
- judgeval/env.py +67 -0
- judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +788 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +75 -15
- judgeval/judges/together_judge.py +86 -18
- judgeval/judges/utils.py +7 -21
- judgeval/{common/logger.py → logger.py} +8 -6
- judgeval/scorers/__init__.py +0 -4
- judgeval/scorers/agent_scorer.py +3 -7
- judgeval/scorers/api_scorer.py +8 -13
- judgeval/scorers/base_scorer.py +52 -32
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
- judgeval/scorers/score.py +21 -31
- judgeval/scorers/trace_api_scorer.py +5 -0
- judgeval/scorers/utils.py +1 -103
- judgeval/tracer/__init__.py +1075 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +37 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +43 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +67 -0
- judgeval/tracer/llm/__init__.py +1233 -0
- judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
- judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
- judgeval/tracer/managers.py +188 -0
- judgeval/tracer/processors/__init__.py +181 -0
- judgeval/tracer/utils.py +20 -0
- judgeval/trainer/__init__.py +5 -0
- judgeval/{common/trainer → trainer}/config.py +12 -9
- judgeval/{common/trainer → trainer}/console.py +2 -9
- judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
- judgeval/{common/trainer → trainer}/trainer.py +119 -17
- judgeval/utils/async_utils.py +2 -3
- judgeval/utils/decorators.py +24 -0
- judgeval/utils/file_utils.py +37 -4
- judgeval/utils/guards.py +32 -0
- judgeval/utils/meta.py +14 -0
- judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
- judgeval/utils/testing.py +88 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +3 -3
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/METADATA +12 -14
- judgeval-0.9.0.dist-info/RECORD +80 -0
- judgeval/clients.py +0 -35
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -375
- judgeval/common/api/constants.py +0 -186
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -97
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -2427
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -188
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -207
- judgeval/common/tracer/trace_manager.py +0 -101
- judgeval/common/trainer/__init__.py +0 -5
- judgeval/common/utils.py +0 -948
- judgeval/integrations/langgraph.py +0 -844
- judgeval/judges/mixture_of_judges.py +0 -287
- judgeval/judgment_client.py +0 -267
- judgeval/rules.py +0 -521
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.7.1.dist-info/RECORD +0 -82
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/WHEEL +0 -0
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
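The bulk of this release is a package-layout reshuffle: modules under judgeval/common/ move to the top level (logger, exceptions, tracer, trainer), run_evaluation.py becomes evaluation/__init__.py, and dataset.py becomes dataset/__init__.py. A rough sketch of what that means for imports, based only on the renames above and the hunks below (the old logger path is inferred from the rename; the full 0.9.0 public API is not shown here):

```python
# Hedged sketch of the import-path migration implied by the renames above.
# Only the judgeval.exceptions and judgeval.logger destinations are confirmed
# by the hunks below; the 0.7.1 logger path is inferred from the file rename.

# judgeval 0.7.1
# from judgeval.common.exceptions import JudgmentAPIError
# from judgeval.common.logger import judgeval_logger

# judgeval 0.9.0
from judgeval.exceptions import JudgmentAPIError
from judgeval.logger import judgeval_logger
```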
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py CHANGED
@@ -1,11 +1,11 @@
 from judgeval.scorers.api_scorer import APIScorerConfig
 from judgeval.constants import APIScorerType
 from typing import Dict, Any, Optional
-from judgeval.
+from judgeval.api import JudgmentSyncClient
+from judgeval.exceptions import JudgmentAPIError
 import os
-from judgeval.common.exceptions import JudgmentAPIError
 from copy import copy
-from judgeval.
+from judgeval.logger import judgeval_logger


 def push_prompt_scorer(
@@ -16,15 +16,28 @@ def push_prompt_scorer(
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
 ) -> str:
-    client =
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
-        r = client.save_scorer(
-
+        r = client.save_scorer(
+            payload={
+                "name": name,
+                "prompt": prompt,
+                "threshold": threshold,
+                "options": options,
+            }
+        )
+    except JudgmentAPIError as e:
         if e.status_code == 500:
             raise JudgmentAPIError(
-
+                status_code=e.status_code,
+                detail=f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {e.detail}",
+                response=e.response,
             )
-        raise JudgmentAPIError(
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to save prompt scorer: {e.detail}",
+            response=e.response,
+        )
     return r["name"]


@@ -33,19 +46,23 @@ def fetch_prompt_scorer(
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
 ):
-    client =
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
-        scorer_config = client.fetch_scorer(name)["scorer"]
+        scorer_config = client.fetch_scorer({"name": name})["scorer"]
         scorer_config.pop("created_at")
         scorer_config.pop("updated_at")
         return scorer_config
-    except
+    except JudgmentAPIError as e:
         if e.status_code == 500:
             raise JudgmentAPIError(
-
+                status_code=e.status_code,
+                detail=f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {e.detail}",
+                response=e.response,
             )
         raise JudgmentAPIError(
-
+            status_code=e.status_code,
+            detail=f"Failed to fetch prompt scorer '{name}': {e.detail}",
+            response=e.response,
         )


@@ -54,15 +71,21 @@ def scorer_exists(
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
 ):
-    client =
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
-        return client.scorer_exists(name)["exists"]
-    except
+        return client.scorer_exists({"name": name})["exists"]
+    except JudgmentAPIError as e:
         if e.status_code == 500:
             raise JudgmentAPIError(
-
+                status_code=e.status_code,
+                detail=f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {e.detail}",
+                response=e.response,
             )
-        raise JudgmentAPIError(
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to check if scorer exists: {e.detail}",
+            response=e.response,
+        )


 class PromptScorer(APIScorerConfig):
@@ -102,7 +125,7 @@ class PromptScorer(APIScorerConfig):
         cls,
         name: str,
         prompt: str,
-        threshold:
+        threshold: float = 0.5,
         options: Optional[Dict[str, float]] = None,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
@@ -122,7 +145,9 @@ class PromptScorer(APIScorerConfig):
             )
         else:
             raise JudgmentAPIError(
-
+                status_code=400,
+                detail=f"Scorer with name {name} already exists. Either use the existing scorer with the get() method or use a new name.",
+                response=None,  # type: ignore
             )

     # Setter functions. Each setter function pushes the scorer to the DB.
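The prompt-scorer helpers now construct a JudgmentSyncClient directly and re-raise JudgmentAPIError with explicit status_code/detail/response keyword arguments. A hedged usage sketch of that pattern, mirroring the hunks above (the scorer name and the surrounding script are illustrative):

```python
import os

# Sketch of the 0.9.0 client/error pattern shown in the hunks above.
from judgeval.api import JudgmentSyncClient
from judgeval.exceptions import JudgmentAPIError

client = JudgmentSyncClient(
    os.getenv("JUDGMENT_API_KEY") or "",
    os.getenv("JUDGMENT_ORG_ID") or "",
)

try:
    # Endpoints now take dict payloads, e.g. {"name": ...}, as in the diff.
    exists = client.scorer_exists({"name": "my-prompt-scorer"})["exists"]
except JudgmentAPIError as e:
    # The helpers above wrap 500s as "temporarily unavailable" and re-raise
    # everything else with the same status_code/detail/response attributes.
    print(e.status_code, e.detail)
else:
    print("scorer already exists" if exists else "scorer not found")
```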
judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py CHANGED
@@ -3,12 +3,12 @@
 """

 # Internal imports
-from judgeval.scorers.
+from judgeval.scorers.trace_api_scorer import TraceAPIScorerConfig
 from judgeval.constants import APIScorerType
 from typing import Optional, Dict


-class ToolDependencyScorer(
+class ToolDependencyScorer(TraceAPIScorerConfig):
     kwargs: Optional[Dict] = None

     def __init__(self, threshold: float = 1.0, enable_param_checking: bool = True):
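ToolDependencyScorer now subclasses TraceAPIScorerConfig; its constructor signature is unchanged in the hunk above. A minimal instantiation sketch (how the scorer is then attached to an evaluation run is not shown in this diff):

```python
# Constructor arguments taken from the __init__ signature visible above.
from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import ToolDependencyScorer

scorer = ToolDependencyScorer(threshold=1.0, enable_param_checking=True)
```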
judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py CHANGED
@@ -3,19 +3,19 @@
 """

 # Internal imports
-from judgeval.scorers.
+from judgeval.scorers.trace_api_scorer import TraceAPIScorerConfig
 from judgeval.constants import APIScorerType
 from typing import Dict, Any


-class ToolOrderScorer(
+class ToolOrderScorer(TraceAPIScorerConfig):
     score_type: APIScorerType = APIScorerType.TOOL_ORDER
     threshold: float = 1.0
     exact_match: bool = False

     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
-        base_fields = set(
+        base_fields = set(TraceAPIScorerConfig.model_fields.keys())
         all_fields = set(self.__class__.model_fields.keys())

         extra_fields = all_fields - base_fields - {"kwargs"}
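ToolOrderScorer.model_dump now derives base_fields from TraceAPIScorerConfig.model_fields and separates the subclass-only fields. The hunk cuts off before showing what happens to extra_fields; the self-contained pydantic sketch below shows one plausible reading of the pattern, with illustrative class names:

```python
# Sketch of the "collect subclass-only fields" model_dump pattern above.
# BaseConfig/OrderConfig are illustrative stand-ins, not judgeval classes;
# folding the extras under "kwargs" is an assumption about the cut-off code.
from typing import Any, Dict
from pydantic import BaseModel


class BaseConfig(BaseModel):
    score_type: str
    threshold: float = 1.0


class OrderConfig(BaseConfig):
    exact_match: bool = False

    def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
        base = super().model_dump(*args, **kwargs)
        base_fields = set(BaseConfig.model_fields.keys())
        all_fields = set(self.__class__.model_fields.keys())
        extra_fields = all_fields - base_fields
        # Move subclass-only fields under a single key.
        base["kwargs"] = {f: base.pop(f) for f in extra_fields}
        return base


print(OrderConfig(score_type="tool_order").model_dump())
# {'score_type': 'tool_order', 'threshold': 1.0, 'kwargs': {'exact_match': False}}
```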
judgeval/scorers/score.py CHANGED
@@ -1,5 +1,5 @@
 """
-Infrastructure for executing evaluations of `Example`s using one or more `
+Infrastructure for executing evaluations of `Example`s using one or more `ExampleScorer`s.
 """

 import asyncio
@@ -13,23 +13,23 @@ from judgeval.data import (
     generate_scoring_result,
     create_scorer_data,
 )
-from judgeval.scorers import
+from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.scorers.utils import clone_scorers
-from judgeval.
+from judgeval.logger import judgeval_logger
 from judgeval.judges import JudgevalJudge
-from judgeval.
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL


 async def safe_a_score_example(
-    scorer:
+    scorer: ExampleScorer,
     example: Example,
 ):
     """
     Scoring task function when not using a progress indicator!
-    "Safely" scores an `Example` using a `
+    "Safely" scores an `Example` using a `ExampleScorer` by gracefully handling any exceptions that may occur.

     Args:
-        scorer (
+        scorer (ExampleScorer): The `ExampleScorer` to use for scoring the example.
         example (Example): The `Example` to be scored.
     """
     try:
@@ -55,20 +55,20 @@ async def safe_a_score_example(

 async def a_execute_scoring(
     examples: List[Example],
-    scorers: List[
-    model: Optional[Union[str, List[str], JudgevalJudge]] =
+    scorers: List[ExampleScorer],
+    model: Optional[Union[str, List[str], JudgevalJudge]] = JUDGMENT_DEFAULT_GPT_MODEL,
     ignore_errors: bool = False,
     throttle_value: int = 0,
     max_concurrent: int = 100,
     show_progress: bool = True,
 ) -> List[ScoringResult]:
     """
-    Executes evaluations of `Example`s asynchronously using one or more `
-    Each `Example` will be evaluated by all of the `
+    Executes evaluations of `Example`s asynchronously using one or more `ExampleScorer`s.
+    Each `Example` will be evaluated by all of the `ExampleScorer`s in the `scorers` list.

     Args:
         examples (List[Example]): A list of `Example` objects to be evaluated.
-        scorers (List[
+        scorers (List[ExampleScorer]): A list of `ExampleScorer` objects to evaluate the examples.
         model (Union[str, List[str], JudgevalJudge]): The model to use for evaluation.
         ignore_errors (bool): Whether to ignore errors during evaluation.
         throttle_value (int): The amount of time to wait between starting each task.
@@ -88,19 +88,15 @@ async def a_execute_scoring(
         except Exception as e:
             judgeval_logger.error(f"Error executing function: {e}")
             if kwargs.get("ignore_errors", False):
-                # Simply return None when ignoring errors, as expected by the test
                 return None
-            # If we're not ignoring errors, propagate the exception
             raise

-    # Add model to scorers
     for scorer in scorers:
-        if not scorer.model:
+        if not scorer.model and isinstance(model, str):
             scorer._add_model(model)

-    scoring_results: List[ScoringResult] = [None for _ in examples]
+    scoring_results: List[Optional[ScoringResult]] = [None for _ in examples]
     tasks = []
-    cloned_scorers: List[BaseScorer]

     if show_progress:
         with tqdm_asyncio(
@@ -115,7 +111,7 @@
                     pbar.update(1)
                     continue

-                cloned_scorers = clone_scorers(scorers)
+                cloned_scorers = clone_scorers(scorers)  # type: ignore
                 task = execute_with_semaphore(
                     func=a_eval_examples_helper,
                     scorers=cloned_scorers,
@@ -135,7 +131,7 @@
             if len(scorers) == 0:
                 continue

-            cloned_scorers = clone_scorers(scorers)
+            cloned_scorers = clone_scorers(scorers)  # type: ignore
             task = execute_with_semaphore(
                 func=a_eval_examples_helper,
                 scorers=cloned_scorers,
@@ -149,13 +145,13 @@

             await asyncio.sleep(throttle_value)
         await asyncio.gather(*tasks)
-    return scoring_results
+    return [result for result in scoring_results if result is not None]


 async def a_eval_examples_helper(
-    scorers: List[
+    scorers: List[ExampleScorer],
     example: Example,
-    scoring_results: List[ScoringResult],
+    scoring_results: List[Optional[ScoringResult]],
     score_index: int,
     ignore_errors: bool,
     pbar: Optional[tqdm_asyncio] = None,
@@ -164,7 +160,7 @@ async def a_eval_examples_helper(
     Evaluate a single example asynchronously using a list of scorers.

     Args:
-        scorers (List[
+        scorers (List[ExampleScorer]): List of ExampleScorer objects to evaluate the example.
         example (Example): The example to be evaluated.
         scoring_results (List[ScoringResult]): List to store the scoring results.
         score_index (int): Index at which the result should be stored in scoring_results.
@@ -174,24 +170,18 @@
         None
     """

-    # scoring the Example
     scoring_start_time = time.perf_counter()

     tasks = [safe_a_score_example(scorer, example) for scorer in scorers]

     await asyncio.gather(*tasks)

-    # Now that all the scoring functions of each scorer have executed, we collect
-    # the results and update the ScoringResult with the scorer data
     success = True
     scorer_data_list = []
     for scorer in scorers:
-        # At this point, the scorer has been executed and already contains data.
         if getattr(scorer, "skipped", False):
             continue
-        scorer_data = create_scorer_data(
-            scorer
-        )  # Fetch scorer data from completed scorer evaluation
+        scorer_data = create_scorer_data(scorer)
         for s in scorer_data:
             success = success and s.success
         scorer_data_list.extend(scorer_data)
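score.py now types scoring_results as List[Optional[ScoringResult]] and filters out the None slots before returning. A generic, self-contained sketch of that concurrency pattern (semaphore-bounded tasks writing into preallocated slots, ignored errors leaving a None); this is illustrative, not judgeval code:

```python
# Sketch of the bounded-concurrency scoring pattern used by a_execute_scoring.
import asyncio
from typing import List, Optional


async def score_all(items: List[str], max_concurrent: int = 100) -> List[str]:
    semaphore = asyncio.Semaphore(max_concurrent)
    results: List[Optional[str]] = [None for _ in items]

    async def score_one(index: int, item: str) -> None:
        async with semaphore:
            try:
                results[index] = item.upper()  # stand-in for a scorer call
            except Exception:
                pass  # with ignore_errors=True the slot simply stays None

    await asyncio.gather(*(score_one(i, item) for i, item in enumerate(items)))
    return [r for r in results if r is not None]


print(asyncio.run(score_all(["a", "b", "c"])))  # ['A', 'B', 'C']
```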
judgeval/scorers/utils.py CHANGED
@@ -2,15 +2,9 @@
 Util functions for Scorer objects
 """

-import
-import nest_asyncio
-import orjson
-import re
-from typing import List, Optional
+from typing import List

 from judgeval.scorers import BaseScorer
-from judgeval.data import Example, ExampleParams
-from judgeval.scorers.exceptions import MissingExampleParamsError


 def clone_scorers(scorers: List[BaseScorer]) -> List[BaseScorer]:
@@ -21,99 +15,3 @@ def clone_scorers(scorers: List[BaseScorer]) -> List[BaseScorer]:
     for s in scorers:
         cloned_scorers.append(s.model_copy(deep=True))
     return cloned_scorers
-
-
-def parse_response_json(llm_response: str, scorer: Optional[BaseScorer] = None) -> dict:
-    """
-    Extracts JSON output from an LLM response and returns it as a dictionary.
-
-    If the JSON is invalid, the error is forwarded to the `scorer`, if provided.
-
-    Args:
-        llm_response (str): The response from an LLM.
-        scorer (BaseScorer, optional): The scorer object to forward errors to (if any).
-    """
-    start = llm_response.find("{")  # opening bracket
-    end = llm_response.rfind("}") + 1  # closing bracket
-
-    if end == 0 and start != -1:  # add the closing bracket if it's missing
-        llm_response = llm_response + "}"
-        end = len(llm_response)
-
-    json_str = (
-        llm_response[start:end] if start != -1 and end != 0 else ""
-    )  # extract the JSON string
-    json_str = re.sub(
-        r",\s*([\]}])", r"\1", json_str
-    )  # Remove trailing comma if present
-
-    try:
-        return orjson.loads(json_str)
-    except orjson.JSONDecodeError:
-        error_str = "Evaluation LLM outputted an invalid JSON. Please use a stronger evaluation model."
-        if scorer is not None:
-            scorer.error = error_str
-        raise ValueError(error_str)
-    except Exception as e:
-        raise Exception(f"An unexpected error occurred: {str(e)}")
-
-
-def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
-    """
-    Get or create an asyncio event loop.
-
-    This function attempts to retrieve the current event loop using `asyncio.get_event_loop()`.
-    If the event loop is already running, it applies the `nest_asyncio` patch to allow nested
-    asynchronous execution. If the event loop is closed or not found, it creates a new event loop
-    and sets it as the current event loop.
-
-    Returns:
-        asyncio.AbstractEventLoop: The current or newly created event loop.
-
-    Raises:
-        RuntimeError: If the event loop is closed.
-    """
-    try:
-        loop = asyncio.get_event_loop()
-        if loop.is_running():
-            print(
-                "Event loop is already running. Applying nest_asyncio patch to allow async execution..."
-            )
-            nest_asyncio.apply()
-
-        if loop.is_closed():
-            raise RuntimeError
-    except RuntimeError:
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-    return loop
-
-
-def check_example_params(
-    example: Example,
-    example_params: List[ExampleParams],
-    scorer: BaseScorer,
-):
-    if isinstance(example, Example) is False:
-        error_str = f"in check_example_params(): Expected example to be of type 'Example', but got {type(example)}"
-        scorer.error = error_str
-        raise MissingExampleParamsError(error_str)
-
-    missing_params = []
-    for param in example_params:
-        if getattr(example, param.value) is None:
-            missing_params.append(f"'{param.value}'")
-
-    if missing_params:
-        if len(missing_params) == 1:
-            missing_params_str = missing_params[0]
-        elif len(missing_params) == 2:
-            missing_params_str = " and ".join(missing_params)
-        else:
-            missing_params_str = (
-                ", ".join(missing_params[:-1]) + ", and " + missing_params[-1]
-            )
-
-        error_str = f"{missing_params_str} fields in example cannot be None for the '{scorer.__name__}' scorer"
-        scorer.error = error_str
-        raise MissingExampleParamsError(error_str)