judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +173 -10
- judgeval/api/__init__.py +523 -0
- judgeval/api/api_types.py +413 -0
- judgeval/cli.py +112 -0
- judgeval/constants.py +7 -30
- judgeval/data/__init__.py +1 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +14 -40
- judgeval/data/judgment_types.py +396 -146
- judgeval/data/result.py +11 -18
- judgeval/data/scorer_data.py +3 -26
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +115 -194
- judgeval/dataset/__init__.py +335 -0
- judgeval/env.py +55 -0
- judgeval/evaluation/__init__.py +346 -0
- judgeval/exceptions.py +28 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +51 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +77 -16
- judgeval/judges/together_judge.py +88 -17
- judgeval/judges/utils.py +7 -20
- judgeval/judgment_attribute_keys.py +55 -0
- judgeval/{common/logger.py → logger.py} +24 -8
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +11 -11
- judgeval/scorers/agent_scorer.py +15 -19
- judgeval/scorers/api_scorer.py +21 -23
- judgeval/scorers/base_scorer.py +54 -36
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
- judgeval/scorers/score.py +64 -47
- judgeval/scorers/utils.py +2 -107
- judgeval/tracer/__init__.py +1111 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +123 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +392 -0
- judgeval/trainer/trainable_model.py +252 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +74 -28
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +5 -3
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/v1/__init__.py +88 -0
- judgeval/v1/data/__init__.py +7 -0
- judgeval/v1/data/example.py +44 -0
- judgeval/v1/data/scorer_data.py +42 -0
- judgeval/v1/data/scoring_result.py +44 -0
- judgeval/v1/datasets/__init__.py +6 -0
- judgeval/v1/datasets/dataset.py +214 -0
- judgeval/v1/datasets/dataset_factory.py +94 -0
- judgeval/v1/evaluation/__init__.py +6 -0
- judgeval/v1/evaluation/evaluation.py +182 -0
- judgeval/v1/evaluation/evaluation_factory.py +17 -0
- judgeval/v1/instrumentation/__init__.py +6 -0
- judgeval/v1/instrumentation/llm/__init__.py +7 -0
- judgeval/v1/instrumentation/llm/config.py +78 -0
- judgeval/v1/instrumentation/llm/constants.py +11 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
- judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
- judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
- judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
- judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
- judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
- judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
- judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
- judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
- judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
- judgeval/v1/instrumentation/llm/providers.py +19 -0
- judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
- judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
- judgeval/v1/integrations/langgraph/__init__.py +13 -0
- judgeval/v1/integrations/openlit/__init__.py +47 -0
- judgeval/v1/internal/api/__init__.py +525 -0
- judgeval/v1/internal/api/api_types.py +413 -0
- judgeval/v1/prompts/__init__.py +6 -0
- judgeval/v1/prompts/prompt.py +29 -0
- judgeval/v1/prompts/prompt_factory.py +189 -0
- judgeval/v1/py.typed +0 -0
- judgeval/v1/scorers/__init__.py +6 -0
- judgeval/v1/scorers/api_scorer.py +82 -0
- judgeval/v1/scorers/base_scorer.py +17 -0
- judgeval/v1/scorers/built_in/__init__.py +17 -0
- judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
- judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
- judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
- judgeval/v1/scorers/built_in/faithfulness.py +28 -0
- judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
- judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
- judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
- judgeval/v1/scorers/scorers_factory.py +49 -0
- judgeval/v1/tracer/__init__.py +7 -0
- judgeval/v1/tracer/base_tracer.py +520 -0
- judgeval/v1/tracer/exporters/__init__.py +14 -0
- judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
- judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
- judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
- judgeval/v1/tracer/exporters/span_store.py +50 -0
- judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
- judgeval/v1/tracer/processors/__init__.py +6 -0
- judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
- judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
- judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
- judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
- judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
- judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
- judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
- judgeval/v1/tracer/tracer.py +67 -0
- judgeval/v1/tracer/tracer_factory.py +38 -0
- judgeval/v1/trainers/__init__.py +5 -0
- judgeval/v1/trainers/base_trainer.py +62 -0
- judgeval/v1/trainers/config.py +123 -0
- judgeval/v1/trainers/console.py +144 -0
- judgeval/v1/trainers/fireworks_trainer.py +392 -0
- judgeval/v1/trainers/trainable_model.py +252 -0
- judgeval/v1/trainers/trainers_factory.py +37 -0
- judgeval/v1/utils.py +18 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.23.0.dist-info/METADATA +266 -0
- judgeval-0.23.0.dist-info/RECORD +201 -0
- judgeval-0.23.0.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -34
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -352
- judgeval/common/api/constants.py +0 -165
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -98
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -1916
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -234
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -211
- judgeval/common/tracer/trace_manager.py +0 -92
- judgeval/common/utils.py +0 -940
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval/data/tool.py +0 -5
- judgeval/data/trace_run.py +0 -37
- judgeval/evaluation_run.py +0 -75
- judgeval/integrations/langgraph.py +0 -843
- judgeval/judges/mixture_of_judges.py +0 -286
- judgeval/judgment_client.py +0 -369
- judgeval/rules.py +0 -521
- judgeval/run_evaluation.py +0 -684
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.1.0.dist-info/METADATA +0 -202
- judgeval-0.1.0.dist-info/RECORD +0 -73
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
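Two entries above are straight renames (`judgeval/common/logger.py → judgeval/logger.py`, `judgeval/version_check.py → judgeval/utils/version_check.py`), and the removed `judgeval/common/*` package is replaced by flat top-level modules (`judgeval/api`, `judgeval/exceptions`, `judgeval/env`, `judgeval/logger`). A minimal import-migration sketch, inferred from those moves and from the import changes visible in the diffs below; the old-path comments are assumptions, so verify against your pinned version:

# Hypothetical 0.1.0 -> 0.23.0 import updates, inferred from the file moves above
# and the new-style imports that appear in the diffs below.
from judgeval.logger import judgeval_logger        # previously under judgeval.common.logger
from judgeval.exceptions import JudgmentAPIError   # previously under judgeval.common
from judgeval.api import JudgmentSyncClient        # replaces the judgeval.common.api client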
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py
CHANGED

@@ -1,28 +1,51 @@
-from judgeval.scorers.api_scorer import
+from judgeval.scorers.api_scorer import (
+    APIScorerConfig,
+    ExampleAPIScorerConfig,
+    TraceAPIScorerConfig,
+)
 from judgeval.constants import APIScorerType
-from typing import
-from judgeval.
+from typing import Dict, Any, Optional
+from judgeval.api import JudgmentSyncClient
+from judgeval.exceptions import JudgmentAPIError
 import os
-from judgeval.
+from judgeval.logger import judgeval_logger
+from abc import ABC
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
+from copy import copy
+from judgeval.utils.decorators.dont_throw import dont_throw


 def push_prompt_scorer(
     name: str,
     prompt: str,
-
+    threshold: float,
+    options: Optional[Dict[str, float]] = None,
+    model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+    description: Optional[str] = None,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
+    is_trace: bool = False,
 ) -> str:
-    client =
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
-        r = client.save_scorer(
-
-
-
-
-
-
-
+        r = client.save_scorer(
+            payload={
+                "name": name,
+                "prompt": prompt,
+                "threshold": threshold,
+                "options": options,
+                "model": model,
+                "description": description,
+                "is_trace": is_trace,
+            }
+        )
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to save prompt scorer: {e.detail}",
+            response=e.response,
+        )
+    return r["scorer_response"]["name"]


 def fetch_prompt_scorer(
@@ -30,19 +53,26 @@ def fetch_prompt_scorer(
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
 ):
-    client =
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
-
-
-
-        return scorer_config
-    except JudgmentAPIException as e:
-        if e.status_code == 500:
+        fetched_scorers = client.fetch_scorers({"names": [name]})
+        if len(fetched_scorers["scorers"]) == 0:
+            judgeval_logger.error(f"Prompt scorer '{name}' not found")
             raise JudgmentAPIError(
-
+                status_code=404,
+                detail=f"Prompt scorer '{name}' not found",
+                response=None,  # type: ignore
             )
+        else:
+            scorer_config = fetched_scorers["scorers"][0]
+            scorer_config.pop("created_at")
+            scorer_config.pop("updated_at")
+            return scorer_config
+    except JudgmentAPIError as e:
         raise JudgmentAPIError(
-
+            status_code=e.status_code,
+            detail=f"Failed to fetch prompt scorer '{name}': {e.detail}",
+            response=e.response,
         )


@@ -51,33 +81,33 @@ def scorer_exists(
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
 ):
-    client =
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
-        return client.scorer_exists(name)["exists"]
-    except
+        return client.scorer_exists({"name": name})["exists"]
+    except JudgmentAPIError as e:
         if e.status_code == 500:
             raise JudgmentAPIError(
-
+                status_code=e.status_code,
+                detail=f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {e.detail}",
+                response=e.response,
             )
-        raise JudgmentAPIError(
-
-
-
-
-    In the Judgment backend, this scorer is implemented as a PromptScorer that takes
-    1. a system role that may involve the Example object
-    2. options for scores on the example
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to check if scorer exists: {e.detail}",
+            response=e.response,
+        )

-    and uses a judge to execute the evaluation from the system role and classify into one of the options
-    """

+class BasePromptScorer(ABC, APIScorerConfig):
+    score_type: APIScorerType
     prompt: str
-    options:
-
+    options: Optional[Dict[str, float]] = None
+    description: Optional[str] = None
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""

     @classmethod
+    @dont_throw
     def get(
         cls,
         name: str,
@@ -85,10 +115,24 @@ class PromptScorer(APIScorerConfig):
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
         scorer_config = fetch_prompt_scorer(name, judgment_api_key, organization_id)
+        if scorer_config["is_trace"] != issubclass(cls, TracePromptScorer):
+            raise JudgmentAPIError(
+                status_code=400,
+                detail=f"Scorer with name {name} is not a {cls.__name__}",
+                response=None,  # type: ignore
+            )
+        if issubclass(cls, TracePromptScorer):
+            score_type = APIScorerType.TRACE_PROMPT_SCORER
+        else:
+            score_type = APIScorerType.PROMPT_SCORER
         return cls(
+            score_type=score_type,
             name=name,
             prompt=scorer_config["prompt"],
-
+            threshold=scorer_config["threshold"],
+            options=scorer_config.get("options"),
+            model=scorer_config.get("model"),
+            description=scorer_config.get("description"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -98,32 +142,51 @@ class PromptScorer(APIScorerConfig):
         cls,
         name: str,
         prompt: str,
-
+        threshold: float = 0.5,
+        options: Optional[Dict[str, float]] = None,
+        model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+        description: Optional[str] = None,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
         if not scorer_exists(name, judgment_api_key, organization_id):
-
+            if issubclass(cls, TracePromptScorer):
+                is_trace = True
+                score_type = APIScorerType.TRACE_PROMPT_SCORER
+            else:
+                is_trace = False
+                score_type = APIScorerType.PROMPT_SCORER
+            push_prompt_scorer(
+                name,
+                prompt,
+                threshold,
+                options,
+                model,
+                description,
+                judgment_api_key,
+                organization_id,
+                is_trace,
+            )
+            judgeval_logger.info(f"Successfully created PromptScorer: {name}")
             return cls(
+                score_type=score_type,
                 name=name,
                 prompt=prompt,
+                threshold=threshold,
                 options=options,
+                model=model,
+                description=description,
                 judgment_api_key=judgment_api_key,
                 organization_id=organization_id,
             )
         else:
             raise JudgmentAPIError(
-
+                status_code=400,
+                detail=f"Scorer with name {name} already exists. Either use the existing scorer with the get() method or use a new name.",
+                response=None,  # type: ignore
             )

     # Setter functions. Each setter function pushes the scorer to the DB.
-    def set_name(self, name: str):
-        """
-        Updates the name of the scorer.
-        """
-        self.name = name
-        self.push_prompt_scorer()
-
     def set_threshold(self, threshold: float):
         """
         Updates the threshold of the scorer.
@@ -140,16 +203,31 @@ class PromptScorer(APIScorerConfig):
         """
         self.prompt = prompt
         self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated prompt for {self.name}")

-    def
+    def set_model(self, model: str):
         """
-        Updates the
+        Updates the model of the scorer.
+        """
+        self.model = model
+        self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated model for {self.name}")

-
-
+    def set_options(self, options: Optional[Dict[str, float]]):
+        """
+        Updates the options of the scorer.
         """
         self.options = options
         self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated options for {self.name}")
+
+    def set_description(self, description: Optional[str]):
+        """
+        Updates the description of the scorer.
+        """
+        self.description = description
+        self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated description for {self.name}")

     def append_to_prompt(self, prompt_addition: str):
         """
@@ -157,21 +235,40 @@ class PromptScorer(APIScorerConfig):
         """
         self.prompt += prompt_addition
         self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully appended to prompt for {self.name}")

     # Getters
-    def
+    def get_threshold(self) -> float:
+        """
+        Returns the threshold of the scorer.
+        """
+        return self.threshold
+
+    def get_prompt(self) -> str:
         """
         Returns the prompt of the scorer.
         """
         return self.prompt

-    def
+    def get_model(self) -> str:
+        """
+        Returns the model of the scorer.
+        """
+        return self.model
+
+    def get_options(self) -> Dict[str, float] | None:
         """
         Returns the options of the scorer.
         """
-        return self.options
+        return copy(self.options) if self.options is not None else None
+
+    def get_description(self) -> str | None:
+        """
+        Returns the description of the scorer.
+        """
+        return self.description

-    def get_name(self) -> str
+    def get_name(self) -> str:
         """
         Returns the name of the scorer.
         """
@@ -183,8 +280,11 @@ class PromptScorer(APIScorerConfig):
         """
         return {
             "name": self.name,
+            "model": self.model,
             "prompt": self.prompt,
+            "threshold": self.threshold,
             "options": self.options,
+            "description": self.description,
         }

     def push_prompt_scorer(self):
@@ -194,13 +294,17 @@ class PromptScorer(APIScorerConfig):
         push_prompt_scorer(
             self.name,
             self.prompt,
+            self.threshold,
             self.options,
+            self.model,
+            self.description,
             self.judgment_api_key,
             self.organization_id,
+            isinstance(self, TracePromptScorer),
         )

     def __str__(self):
-        return f"PromptScorer(name={self.name}, prompt={self.prompt}, options={self.options})"
+        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold}, options={self.options}, description={self.description})"

     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
@@ -213,3 +317,11 @@ class PromptScorer(APIScorerConfig):
             k: getattr(self, k) for k in extra_fields if getattr(self, k) is not None
         }
         return base
+
+
+class PromptScorer(BasePromptScorer, ExampleAPIScorerConfig):
+    pass
+
+
+class TracePromptScorer(BasePromptScorer, TraceAPIScorerConfig):
+    pass
judgeval/scorers/score.py
CHANGED
@@ -1,5 +1,5 @@
 """
-Infrastructure for executing evaluations of `Example`s using one or more `
+Infrastructure for executing evaluations of `Example`s using one or more `ExampleScorer`s.
 """

 import asyncio
@@ -13,61 +13,67 @@ from judgeval.data import (
     generate_scoring_result,
     create_scorer_data,
 )
-from judgeval.scorers import
+from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.scorers.utils import clone_scorers
-from judgeval.
+from judgeval.logger import judgeval_logger
 from judgeval.judges import JudgevalJudge
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL


 async def safe_a_score_example(
-    scorer:
+    scorer: ExampleScorer,
     example: Example,
 ):
     """
     Scoring task function when not using a progress indicator!
-    "Safely" scores an `Example` using a `
+    "Safely" scores an `Example` using a `ExampleScorer` by gracefully handling any exceptions that may occur.

     Args:
-        scorer (
+        scorer (ExampleScorer): The `ExampleScorer` to use for scoring the example.
         example (Example): The `Example` to be scored.
-
-        ignore_errors (bool): Whether to ignore errors during the evaluation.
-            If set to false, any error will be raised and stop the evaluation.
-            If set to true, the error will be stored in the `error` attribute of the `BaseScorer` and the `success` attribute will be set to False.
-
-        skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
     """
     try:
-
+        score = await scorer.a_score_example(example)
+        if score is None:
+            raise Exception("a_score_example need to return a score")
+        elif score < 0:
+            judgeval_logger.warning("score cannot be less than 0 , setting to 0")
+            score = 0
+        elif score > 1:
+            judgeval_logger.warning("score cannot be greater than 1 , setting to 1")
+            score = 1
+        else:
+            scorer.score = score
         scorer.success = scorer.success_check()
     except Exception as e:
         judgeval_logger.error(f"Error during scoring: {str(e)}")
         scorer.error = str(e)
         scorer.success = False
+        scorer.score = 0
     return


 async def a_execute_scoring(
     examples: List[Example],
-    scorers: List[
-    model: Optional[Union[str, List[str], JudgevalJudge]] =
+    scorers: List[ExampleScorer],
+    model: Optional[Union[str, List[str], JudgevalJudge]] = JUDGMENT_DEFAULT_GPT_MODEL,
     ignore_errors: bool = False,
     throttle_value: int = 0,
     max_concurrent: int = 100,
+    show_progress: bool = True,
 ) -> List[ScoringResult]:
     """
-    Executes evaluations of `Example`s asynchronously using one or more `
-    Each `Example` will be evaluated by all of the `
+    Executes evaluations of `Example`s asynchronously using one or more `ExampleScorer`s.
+    Each `Example` will be evaluated by all of the `ExampleScorer`s in the `scorers` list.

     Args:
         examples (List[Example]): A list of `Example` objects to be evaluated.
-        scorers (List[
+        scorers (List[ExampleScorer]): A list of `ExampleScorer` objects to evaluate the examples.
         model (Union[str, List[str], JudgevalJudge]): The model to use for evaluation.
         ignore_errors (bool): Whether to ignore errors during evaluation.
         throttle_value (int): The amount of time to wait between starting each task.
         max_concurrent (int): The maximum number of concurrent tasks.
-
-        _use_bar_indicator (bool): Whether to use a progress bar indicator.
+        show_progress (bool): Whether to show the progress bar indicator.

     Returns:
         List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results.
@@ -82,33 +88,50 @@ async def a_execute_scoring(
             except Exception as e:
                 judgeval_logger.error(f"Error executing function: {e}")
                 if kwargs.get("ignore_errors", False):
-                    # Simply return None when ignoring errors, as expected by the test
                     return None
-                # If we're not ignoring errors, propagate the exception
                 raise

-    # Add model to scorers
     for scorer in scorers:
-        if not scorer.model:
+        if not scorer.model and isinstance(model, str):
             scorer._add_model(model)

-    scoring_results: List[ScoringResult] = [None for _ in examples]
+    scoring_results: List[Optional[ScoringResult]] = [None for _ in examples]
     tasks = []
-
-
-
-
-
-
-
-
+
+    if show_progress:
+        with tqdm_asyncio(
+            desc=f"Evaluating {len(examples)} example(s) in parallel",
+            unit="Example",
+            total=len(examples),
+            bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
+        ) as pbar:
+            for i, ex in enumerate(examples):
+                if isinstance(ex, Example):
+                    if len(scorers) == 0:
+                        pbar.update(1)
+                        continue
+
+                    cloned_scorers = clone_scorers(scorers)  # type: ignore
+                    task = execute_with_semaphore(
+                        func=a_eval_examples_helper,
+                        scorers=cloned_scorers,
+                        example=ex,
+                        scoring_results=scoring_results,
+                        score_index=i,
+                        ignore_errors=ignore_errors,
+                        pbar=pbar,
+                    )
+                    tasks.append(asyncio.create_task(task))
+
+                await asyncio.sleep(throttle_value)
+            await asyncio.gather(*tasks)
+    else:
         for i, ex in enumerate(examples):
             if isinstance(ex, Example):
                 if len(scorers) == 0:
-                    pbar.update(1)
                     continue

-                cloned_scorers = clone_scorers(scorers)
+                cloned_scorers = clone_scorers(scorers)  # type: ignore
                 task = execute_with_semaphore(
                     func=a_eval_examples_helper,
                     scorers=cloned_scorers,
@@ -116,19 +139,19 @@ async def a_execute_scoring(
                     scoring_results=scoring_results,
                     score_index=i,
                     ignore_errors=ignore_errors,
-                    pbar=
+                    pbar=None,
                 )
                 tasks.append(asyncio.create_task(task))

             await asyncio.sleep(throttle_value)
         await asyncio.gather(*tasks)
-    return scoring_results
+    return [result for result in scoring_results if result is not None]


 async def a_eval_examples_helper(
-    scorers: List[
+    scorers: List[ExampleScorer],
     example: Example,
-    scoring_results: List[ScoringResult],
+    scoring_results: List[Optional[ScoringResult]],
     score_index: int,
     ignore_errors: bool,
     pbar: Optional[tqdm_asyncio] = None,
@@ -137,7 +160,7 @@ async def a_eval_examples_helper(
     Evaluate a single example asynchronously using a list of scorers.

     Args:
-        scorers (List[
+        scorers (List[ExampleScorer]): List of ExampleScorer objects to evaluate the example.
         example (Example): The example to be evaluated.
         scoring_results (List[ScoringResult]): List to store the scoring results.
         score_index (int): Index at which the result should be stored in scoring_results.
@@ -147,24 +170,18 @@ async def a_eval_examples_helper(
         None
     """

-    # scoring the Example
     scoring_start_time = time.perf_counter()

     tasks = [safe_a_score_example(scorer, example) for scorer in scorers]

     await asyncio.gather(*tasks)

-    # Now that all the scoring functions of each scorer have executed, we collect
-    # the results and update the ScoringResult with the scorer data
     success = True
     scorer_data_list = []
     for scorer in scorers:
-        # At this point, the scorer has been executed and already contains data.
         if getattr(scorer, "skipped", False):
             continue
-        scorer_data = create_scorer_data(
-            scorer
-        )  # Fetch scorer data from completed scorer evaluation
+        scorer_data = create_scorer_data(scorer)
         for s in scorer_data:
             success = success and s.success
         scorer_data_list.extend(scorer_data)