judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of judgeval might be problematic.
- judgeval/__init__.py +177 -12
- judgeval/api/__init__.py +519 -0
- judgeval/api/api_types.py +407 -0
- judgeval/cli.py +79 -0
- judgeval/constants.py +76 -47
- judgeval/data/__init__.py +3 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +15 -56
- judgeval/data/judgment_types.py +450 -0
- judgeval/data/result.py +29 -73
- judgeval/data/scorer_data.py +29 -62
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/trace.py +121 -0
- judgeval/dataset/__init__.py +264 -0
- judgeval/env.py +52 -0
- judgeval/evaluation/__init__.py +344 -0
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +50 -0
- judgeval/judges/__init__.py +2 -3
- judgeval/judges/base_judge.py +2 -3
- judgeval/judges/litellm_judge.py +100 -20
- judgeval/judges/together_judge.py +101 -20
- judgeval/judges/utils.py +20 -24
- judgeval/logger.py +62 -0
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +18 -25
- judgeval/scorers/agent_scorer.py +17 -0
- judgeval/scorers/api_scorer.py +45 -41
- judgeval/scorers/base_scorer.py +83 -38
- judgeval/scorers/example_scorer.py +17 -0
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorers/__init__.py +0 -148
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
- judgeval/scorers/score.py +77 -306
- judgeval/scorers/utils.py +4 -199
- judgeval/tracer/__init__.py +1122 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +128 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +396 -0
- judgeval/trainer/trainable_model.py +243 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +97 -0
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/utils/version_check.py +28 -0
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.22.2.dist-info/METADATA +265 -0
- judgeval-0.22.2.dist-info/RECORD +112 -0
- judgeval-0.22.2.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -39
- judgeval/common/__init__.py +0 -8
- judgeval/common/exceptions.py +0 -28
- judgeval/common/logger.py +0 -189
- judgeval/common/tracer.py +0 -798
- judgeval/common/utils.py +0 -763
- judgeval/data/api_example.py +0 -111
- judgeval/data/datasets/__init__.py +0 -5
- judgeval/data/datasets/dataset.py +0 -286
- judgeval/data/datasets/eval_dataset_client.py +0 -193
- judgeval/data/datasets/ground_truth.py +0 -54
- judgeval/data/datasets/utils.py +0 -74
- judgeval/evaluation_run.py +0 -132
- judgeval/judges/mixture_of_judges.py +0 -248
- judgeval/judgment_client.py +0 -354
- judgeval/run_evaluation.py +0 -439
- judgeval/scorers/judgeval_scorer.py +0 -140
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
- judgeval/scorers/prompt_scorer.py +0 -439
- judgeval-0.0.11.dist-info/METADATA +0 -36
- judgeval-0.0.11.dist-info/RECORD +0 -84
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/prompt/__init__.py
ADDED
@@ -0,0 +1,330 @@
+from typing import List, Optional, Dict
+from judgeval.api import JudgmentSyncClient
+from judgeval.exceptions import JudgmentAPIError
+from judgeval.api.api_types import (
+    PromptCommitInfo,
+    PromptTagResponse,
+    PromptUntagResponse,
+    PromptVersionsResponse,
+)
+from dataclasses import dataclass, field
+import re
+from string import Template
+from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
+from judgeval.utils.project import _resolve_project_id
+
+
+def push_prompt(
+    project_name: str,
+    name: str,
+    prompt: str,
+    tags: List[str],
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> tuple[str, Optional[str], str]:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        r = client.prompts_insert(
+            payload={
+                "project_id": project_id,
+                "name": name,
+                "prompt": prompt,
+                "tags": tags,
+            }
+        )
+        return r["commit_id"], r.get("parent_commit_id"), r["created_at"]
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to save prompt: {e.detail}",
+            response=e.response,
+        )
+
+
+def fetch_prompt(
+    project_name: str,
+    name: str,
+    commit_id: Optional[str] = None,
+    tag: Optional[str] = None,
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> Optional[PromptCommitInfo]:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        prompt_config = client.prompts_fetch(
+            name=name,
+            project_id=project_id,
+            commit_id=commit_id,
+            tag=tag,
+        )
+        return prompt_config["commit"]
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to fetch prompt '{name}': {e.detail}",
+            response=e.response,
+        )
+
+
+def tag_prompt(
+    project_name: str,
+    name: str,
+    commit_id: str,
+    tags: List[str],
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> PromptTagResponse:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        prompt_config = client.prompts_tag(
+            payload={
+                "project_id": project_id,
+                "name": name,
+                "commit_id": commit_id,
+                "tags": tags,
+            }
+        )
+        return prompt_config
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to tag prompt '{name}': {e.detail}",
+            response=e.response,
+        )
+
+
+def untag_prompt(
+    project_name: str,
+    name: str,
+    tags: List[str],
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> PromptUntagResponse:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        prompt_config = client.prompts_untag(
+            payload={"project_id": project_id, "name": name, "tags": tags}
+        )
+        return prompt_config
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to untag prompt '{name}': {e.detail}",
+            response=e.response,
+        )
+
+
+def list_prompt(
+    project_name: str,
+    name: str,
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> PromptVersionsResponse:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        prompt_config = client.prompts_get_prompt_versions(
+            project_id=project_id, name=name
+        )
+        return prompt_config
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to list prompt '{name}': {e.detail}",
+            response=e.response,
+        )
+
+
+@dataclass
+class Prompt:
+    name: str
+    prompt: str
+    created_at: str
+    tags: List[str]
+    commit_id: str
+    parent_commit_id: Optional[str] = None
+    metadata: Dict[str, str] = field(default_factory=dict)
+    _template: Template = field(init=False, repr=False)
+
+    def __post_init__(self):
+        template_str = re.sub(r"\{\{([^}]+)\}\}", r"$\1", self.prompt)
+        self._template = Template(template_str)
+
+    @classmethod
+    def create(
+        cls,
+        project_name: str,
+        name: str,
+        prompt: str,
+        tags: Optional[List[str]] = None,
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        if tags is None:
+            tags = []
+        commit_id, parent_commit_id, created_at = push_prompt(
+            project_name, name, prompt, tags, judgment_api_key, organization_id
+        )
+        return cls(
+            name=name,
+            prompt=prompt,
+            created_at=created_at,
+            tags=tags,
+            commit_id=commit_id,
+            parent_commit_id=parent_commit_id,
+        )
+
+    @classmethod
+    def get(
+        cls,
+        project_name: str,
+        name: str,
+        commit_id: Optional[str] = None,
+        tag: Optional[str] = None,
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        if commit_id is not None and tag is not None:
+            raise ValueError(
+                "You cannot fetch a prompt by both commit_id and tag at the same time"
+            )
+        prompt_config = fetch_prompt(
+            project_name, name, commit_id, tag, judgment_api_key, organization_id
+        )
+        if prompt_config is None:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Prompt '{name}' not found in project '{project_name}'",
+                response=None,  # type: ignore
+            )
+        return cls(
+            name=prompt_config["name"],
+            prompt=prompt_config["prompt"],
+            created_at=prompt_config["created_at"],
+            tags=prompt_config["tags"],
+            commit_id=prompt_config["commit_id"],
+            parent_commit_id=prompt_config.get("parent_commit_id"),
+            metadata={
+                "creator_first_name": prompt_config["first_name"],
+                "creator_last_name": prompt_config["last_name"],
+                "creator_email": prompt_config["user_email"],
+            },
+        )
+
+    @classmethod
+    def tag(
+        cls,
+        project_name: str,
+        name: str,
+        commit_id: str,
+        tags: List[str],
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        prompt_config = tag_prompt(
+            project_name, name, commit_id, tags, judgment_api_key, organization_id
+        )
+        return prompt_config["commit_id"]
+
+    @classmethod
+    def untag(
+        cls,
+        project_name: str,
+        name: str,
+        tags: List[str],
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        prompt_config = untag_prompt(
+            project_name, name, tags, judgment_api_key, organization_id
+        )
+        return prompt_config["commit_ids"]
+
+    @classmethod
+    def list(
+        cls,
+        project_name: str,
+        name: str,
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        prompt_configs = list_prompt(
+            project_name, name, judgment_api_key, organization_id
+        )["versions"]
+        return [
+            cls(
+                name=prompt_config["name"],
+                prompt=prompt_config["prompt"],
+                tags=prompt_config["tags"],
+                created_at=prompt_config["created_at"],
+                commit_id=prompt_config["commit_id"],
+                parent_commit_id=prompt_config.get("parent_commit_id"),
+                metadata={
+                    "creator_first_name": prompt_config["first_name"],
+                    "creator_last_name": prompt_config["last_name"],
+                    "creator_email": prompt_config["user_email"],
+                },
+            )
+            for prompt_config in prompt_configs
+        ]
+
+    def compile(self, **kwargs) -> str:
+        try:
+            return self._template.substitute(**kwargs)
+        except KeyError as e:
+            missing_var = str(e).strip("'")
+            raise ValueError(f"Missing required variable: {missing_var}")
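Below is a minimal usage sketch of the new prompt module (not part of the diff). It assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set; the project name, prompt name, and template text are made up for illustration.

from judgeval.prompt import Prompt

# Create a new prompt version; {{variables}} are rewritten to string.Template
# placeholders ($user, $product) by __post_init__.
prompt = Prompt.create(
    project_name="my-project",   # hypothetical project
    name="greeting",
    prompt="Hello {{user}}, welcome to {{product}}!",
    tags=["v1"],
)

# compile() substitutes the template variables and raises ValueError if one is missing.
text = prompt.compile(user="Ada", product="Judgment")

# Fetch a stored version by tag or by commit_id (passing both raises ValueError).
same = Prompt.get(project_name="my-project", name="greeting", tag="v1")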
judgeval/scorers/__init__.py
CHANGED
@@ -1,36 +1,29 @@
-from judgeval.scorers.api_scorer import
-
-
-
-
-
-
-
+from judgeval.scorers.api_scorer import (
+    APIScorerConfig,
+    ExampleAPIScorerConfig,
+    TraceAPIScorerConfig,
+)
+from judgeval.scorers.base_scorer import BaseScorer
+from judgeval.scorers.example_scorer import ExampleScorer
+from judgeval.scorers.judgeval_scorers.api_scorers import (
     FaithfulnessScorer,
-    ContextualRelevancyScorer,
-    ContextualPrecisionScorer,
-    ContextualRecallScorer,
     AnswerRelevancyScorer,
-    ScorerWrapper,
     AnswerCorrectnessScorer,
-
+    InstructionAdherenceScorer,
+    TracePromptScorer,
+    PromptScorer,
 )
 
 __all__ = [
-    "
-    "
+    "APIScorerConfig",
+    "ExampleAPIScorerConfig",
+    "TraceAPIScorerConfig",
+    "BaseScorer",
+    "ExampleScorer",
+    "TracePromptScorer",
     "PromptScorer",
-    "ClassifierScorer",
-    "ToolCorrectnessScorer",
-    "JSONCorrectnessScorer",
-    "SummarizationScorer",
-    "HallucinationScorer",
     "FaithfulnessScorer",
-    "ContextualRelevancyScorer",
-    "ContextualPrecisionScorer",
-    "ContextualRecallScorer",
     "AnswerRelevancyScorer",
-    "ScorerWrapper",
     "AnswerCorrectnessScorer",
-    "
+    "InstructionAdherenceScorer",
 ]
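For reference, the re-exported surface after this change can be imported as shown below (illustrative, mirroring the new __all__); removed names such as ScorerWrapper and the Contextual* scorers no longer import.

from judgeval.scorers import (
    APIScorerConfig,
    BaseScorer,
    ExampleScorer,
    FaithfulnessScorer,
    AnswerRelevancyScorer,
    AnswerCorrectnessScorer,
    InstructionAdherenceScorer,
    PromptScorer,
    TracePromptScorer,
)

# Names dropped from __all__ now fail at import time, e.g.:
# from judgeval.scorers import ScorerWrapper  # ImportError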
judgeval/scorers/agent_scorer.py
ADDED
@@ -0,0 +1,17 @@
+# from judgeval.scorers.base_scorer import BaseScorer
+# from judgeval.data.judgment_types import Trace as JudgmentTrace
+# from typing import List, Optional
+# from abc import abstractmethod
+
+
+# class TraceScorer(BaseScorer):
+#     @abstractmethod
+#     async def a_score_trace(
+#         self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
+#     ) -> float:
+#         """
+#         Asynchronously measures the score on a trace
+#         """
+#         raise NotImplementedError(
+#             "You must implement the `a_score_trace` method in your custom scorer"
+#         )
judgeval/scorers/api_scorer.py
CHANGED
@@ -4,61 +4,65 @@ Judgment Scorer class.
 Scores `Example`s using ready-made Judgment evaluators.
 """
 
-from
-from judgeval.common.logger import debug, info, warning, error
+from __future__ import annotations
 
-from
+from pydantic import BaseModel, field_validator
+from typing import List
+from judgeval.constants import APIScorerType
+from judgeval.data.example import ExampleParams
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
 
-class
+class APIScorerConfig(BaseModel):
     """
-
+    Scorer config that is used to send to our Judgment server.
 
     Args:
         score_type (APIScorer): The Judgment metric to use for scoring `Example`s
+        name (str): The name of the scorer, usually this is the same as the score_type
         threshold (float): A value between 0 and 1 that determines the scoring threshold
+        strict_mode (bool): Whether to use strict mode for the scorer
+        required_params (List[ExampleParams]): List of the required parameters on examples for the scorer
+        kwargs (dict): Additional keyword arguments to pass to the scorer
     """
-    threshold: float
-    score_type: APIScorer
 
-
-
+    score_type: APIScorerType
+    name: str = ""
+    threshold: float = 0.5
+    strict_mode: bool = False
+    model: str = JUDGMENT_DEFAULT_GPT_MODEL
+
+    required_params: List[ExampleParams] = []
+
+    kwargs: dict = {}
+
+    @field_validator("threshold")
+    @classmethod
+    def validate_threshold(cls, v, info):
        """
         Validates that the threshold is between 0 and 1 inclusive.
         """
+        score_type = info.data.get("score_type")
         if not 0 <= v <= 1:
-
-
+            raise ValueError(
+                f"Threshold for {score_type} must be between 0 and 1, got: {v}"
+            )
+        return v
+
+    @field_validator("name", mode="after")
+    @classmethod
+    def set_name_to_score_type_if_none(cls, v, info):
+        if v is None:
+            return info.data.get("score_type")
         return v
 
-    @field_validator('score_type')
-    def convert_to_enum_value(cls, v):
-        """
-        Validates that the `score_type` is a valid `JudgmentMetric` enum value.
-        Converts string values to `JudgmentMetric` enum values.
-        """
-        debug(f"Attempting to convert score_type value: {v}")
-        if isinstance(v, APIScorer):
-            info(f"Using existing JudgmentMetric: {v.value}")
-            return v.value
-        elif isinstance(v, str):
-            debug(f"Converting string value to JudgmentMetric enum: {v}")
-            return APIScorer[v.upper()].value
-        error(f"Invalid score_type value: {v}")
-        raise ValueError(f"Invalid value for score_type: {v}")
-
     def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
-
-
-
-
-
-
-
-
-        return {
-            "score_type": self.score_type,
-            "threshold": self.threshold
-        }
-
+        return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
+
+
+class ExampleAPIScorerConfig(APIScorerConfig):
+    pass
+
+
+class TraceAPIScorerConfig(APIScorerConfig):
+    pass
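A short sketch (not from the diff) of how the new config validates its threshold. The APIScorerType.FAITHFULNESS member name is an assumption; only the enum itself appears in the diff above.

from judgeval.constants import APIScorerType
from judgeval.scorers import ExampleAPIScorerConfig

# Valid: threshold inside [0, 1]
cfg = ExampleAPIScorerConfig(
    score_type=APIScorerType.FAITHFULNESS,  # assumed enum member
    threshold=0.8,
)

# Invalid: validate_threshold rejects out-of-range values; pydantic surfaces
# the ValueError raised inside the validator.
try:
    ExampleAPIScorerConfig(score_type=APIScorerType.FAITHFULNESS, threshold=1.5)
except ValueError as err:
    print(err)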
judgeval/scorers/base_scorer.py
CHANGED
@@ -1,52 +1,97 @@
 """
-
-
-Scores `Example`s using ready-made Judgment evaluators.
+Base class for all scorers.
 """
 
-from
-from
+from __future__ import annotations
+from typing import Dict, Optional
 
-from
+from pydantic import BaseModel
 
 
-
-
-
+from judgeval.judges.utils import create_judge
+from typing import Any
+from pydantic import model_validator, Field
+
 
-
-    score_type (APIScorer): The Judgment metric to use for scoring `Example`s
-    threshold (float): A value between 0 and 1 that determines the scoring threshold
+class BaseScorer(BaseModel):
     """
-
-
+    If you want to create a scorer that does not fall under any of the ready-made Judgment scorers,
+    you can create a custom scorer by extending this class. This is best used for special use cases
+    where none of Judgment's scorers are suitable.
+    """
+
+    # type of your scorer (Faithfulness, PromptScorer)
+    score_type: str
+
+    # The threshold to pass a test while using this scorer as a scorer
+    threshold: float = 0.5
+
+    # name of your scorer (Faithfulness, PromptScorer-randomslug)
+    name: str = ""
+
+    # The name of the class of the scorer
+    class_name: Optional[str] = None
+
+    # The float score of the scorer run on the test case
+    score: Optional[float] = None
+
+    score_breakdown: Optional[Dict] = None
+    reason: Optional[str] = ""
+
+    # Whether the model is a native model
+    using_native_model: Optional[bool] = None
 
-
-
+    # Whether the test case passed or failed
+    success: bool = False
+
+    # The name of the model used to evaluate the test case
+    model: Optional[str] = None
+
+    # The model used to evaluate the test case
+    model_client: Optional[Any] = Field(default=None, exclude=True)
+
+    # Whether to run the scorer in strict mode
+    strict_mode: bool = False
+
+    # The error message if the scorer failed
+    error: Optional[str] = None
+
+    # Additional metadata for the scorer
+    additional_metadata: Optional[Dict] = None
+
+    # The user ID of the scorer
+    user: Optional[str] = None
+
+    # Whether the scorer is hosted on the server
+    server_hosted: bool = False
+
+    @model_validator(mode="after")
+    def enforce_strict_threshold(self):
+        if self.strict_mode:
+            self.threshold = 1.0
+        return self
+
+    @model_validator(mode="after")
+    def default_name(self):
+        self.class_name = self.__class__.__name__
+        if not self.name:
+            self.name = self.class_name
+        return self
+
+    def _add_model(self, model: str):
         """
-
+        Adds the evaluation model to the BaseScorer instance
+
+        This method is used at eval time
         """
-
-            error(f"Threshold must be between 0 and 1, got: {v}")
-            raise ValueError(f"Threshold must be between 0 and 1, got: {v}")
-        return v
+        self.model_client, self.using_native_model = create_judge(model)
 
-
-    def convert_to_enum_value(cls, v):
+    def success_check(self) -> bool:
         """
-
-        Converts string values to `JudgmentMetric` enum values.
+        For unit testing, determines whether the test case passes or fails
         """
-
-
-
-            return
-
-            debug(f"Converting string value to JudgmentMetric enum: {v}")
-            return APIScorer[v.upper()].value
-        error(f"Invalid score_type value: {v}")
-        raise ValueError(f"Invalid value for score_type: {v}")
-
-    def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
-
+        if self.error:
+            return False
+        if self.score is None:
+            return False
+        return self.score >= self.threshold
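A minimal sketch (not from the diff) exercising the validators defined above, assuming the fields behave exactly as declared.

from judgeval.scorers import BaseScorer

scorer = BaseScorer(score_type="Faithfulness", threshold=0.7, strict_mode=True)
assert scorer.threshold == 1.0      # enforce_strict_threshold overrides the value
assert scorer.name == "BaseScorer"  # default_name falls back to the class name

scorer.score = 1.0
assert scorer.success_check()       # score >= threshold passes

scorer.error = "judge model timed out"  # hypothetical error message
assert not scorer.success_check()       # any recorded error fails the check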
judgeval/scorers/example_scorer.py
ADDED
@@ -0,0 +1,17 @@
+from judgeval.scorers.base_scorer import BaseScorer
+from judgeval.data import Example
+from typing import List
+from pydantic import Field
+
+
+class ExampleScorer(BaseScorer):
+    score_type: str = "Custom"
+    required_params: List[str] = Field(default_factory=list)
+
+    async def a_score_example(self, example: Example, *args, **kwargs) -> float:
+        """
+        Asynchronously measures the score on a single example
+        """
+        raise NotImplementedError(
+            "You must implement the `a_score_example` method in your custom scorer"
+        )
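A minimal custom scorer sketch built on ExampleScorer (not from the diff); the Example fields actual_output and expected_output are assumed here for illustration.

from typing import Any

from judgeval.data import Example
from judgeval.scorers import ExampleScorer


class ExactMatchScorer(ExampleScorer):
    score_type: str = "Exact Match"
    threshold: float = 1.0

    async def a_score_example(self, example: Example, *args: Any, **kwargs: Any) -> float:
        # 1.0 when the output matches the reference exactly, else 0.0
        # (actual_output / expected_output are assumed Example fields)
        self.score = float(example.actual_output == example.expected_output)
        self.reason = "exact match" if self.score else "outputs differ"
        return self.score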